#!/bin/sh
# Build the CPIA dataset directory from the downloaded archives.
#
# Run this from the directory that holds L.zip, M.zip and S.zip (together
# with the extra L.z01 / M.z01 / M.z02 files that this script removes once
# the L and M archives have been rebuilt).
#
# Result: ./datasets/{L,M,S} plus ./datasets/All holding the contents of
# all three.

# Rebuild L.zip and M.zip: `zip -F` writes a fixed copy to *_Scale.zip,
# then `zip -FF ... -fz` rewrites it back over the original archive name.
zip -F L.zip --out L_Scale.zip
zip -FF L_Scale.zip --out L.zip -fz
zip -F M.zip --out M_Scale.zip
zip -FF M_Scale.zip --out M.zip -fz

# drop the intermediates and the extra archive parts
rm -f L_Scale.zip
rm -f L.z01
rm -f M_Scale.zip
rm -f M.z01
rm -f M.z02

# build a directory of datasets (-p: do not fail when re-running)
mkdir -p datasets
mv L.zip datasets
mv M.zip datasets
mv S.zip datasets

# stop here if the directory is missing, otherwise the unzip/rm commands
# below would run in the wrong place
cd datasets || exit 1
unzip L.zip
unzip M.zip
unzip S.zip

rm -f L.zip
rm -f M.zip
rm -f S.zip

# merge the three scales into one folder; the copies run in parallel
mkdir -p All
cp -r L/* All/ &
cp -r M/* All/ &
cp -r S/* All/

# wait for the two background copies so the script only returns once
# All/ is complete
wait
def build_promptmodel(num_classes=1000, edge_size=224, model_idx='ViT', patch_size=16,
                      Prompt_Token_num=20, VPT_type="Deep", prompt_state_dict=None, base_state_dict='timm'):
    """
    Build a VPT model (prompt version of ViT), following https://github.com/sagizty/VPT

    With the additional prompt tokens, the information in each layer becomes
    [B, N_patch + N_prompt, Dim].

    During training only the prompt tokens and the head layer are set to be
    learnable while the rest of the Transformer layers are frozen.

    VPT_type = "Shallow" / "Deep"
    - Shallow: concatenate N_prompt prompt tokens before the first Transformer Encoder block,
        in each layer the information becomes [B, N_patch + N_prompt, Dim]
    - Deep: concatenate N_prompt prompt tokens to each Transformer Encoder block,
        this replaces the output prompt tokens learnt from the previous encoder.

    :param num_classes: passed to ``model.New_CLS_head`` to build the classification head
    :param edge_size: input image edge size; also used in the timm model name and the smoke test
    :param model_idx: model name, only names starting with 'ViT' are supported
    :param patch_size: ViT patch size; also used in the timm model name
    :param Prompt_Token_num: number of prompt tokens, forwarded to ``VPT_ViT``
    :param VPT_type: "Shallow" or "Deep", forwarded to ``VPT_ViT``
    :param prompt_state_dict: optional prompt weights, loaded with ``model.load_prompt``
    :param base_state_dict: backbone weights:
        - None: no backbone state dict is given to ``VPT_ViT``
        - 'timm': use the state dict of the pretrained timm
          ``vit_base_patch{patch_size}_{edge_size}`` model
        - any other str: treated as invalid, no backbone state dict is used
        - anything else: used directly as the state dict (e.g. an OrderedDict from a .pth file)

    :return: the built model, or -1 when the model name is unsupported or the
        forward smoke test fails
    """
    # guard clause: only the ViT family is defined in this script
    if model_idx[0:3] != 'ViT':
        print("The model is not difined in the Prompt script!!")
        return -1

    # resolve the backbone weights
    if base_state_dict is None:
        basic_state_dict = None

    elif isinstance(base_state_dict, str):
        if base_state_dict == 'timm':
            # imported lazily so timm is only needed when its weights are requested
            import timm

            basic_model = timm.create_model('vit_base_patch' + str(patch_size) + '_' + str(edge_size),
                                            pretrained=True)
            basic_state_dict = basic_model.state_dict()
            print('in prompt model building, timm ViT loaded for base_state_dict')

        else:
            basic_state_dict = None
            print('in prompt model building, no vaild str for base_state_dict')

    else:  # state dict: collections.OrderedDict
        basic_state_dict = base_state_dict
        print('in prompt model building, a .pth base_state_dict loaded')

    model = VPT_ViT(img_size=edge_size, patch_size=patch_size, Prompt_Token_num=Prompt_Token_num,
                    VPT_type=VPT_type, basic_state_dict=basic_state_dict)

    model.New_CLS_head(num_classes)

    if prompt_state_dict is not None:
        # best effort: a prompt that cannot be loaded is reported, building continues
        try:
            model.load_prompt(prompt_state_dict)
        except Exception as err:  # not a bare except: Ctrl-C / SystemExit must still propagate
            print('erro in .pth prompt_state_dict')
            print(repr(err))
        else:
            print('in prompt model building, a .pth prompt_state_dict loaded')

    model.Freeze()

    # smoke test: one forward pass with a random image of the configured size
    try:
        img = torch.randn(1, 3, edge_size, edge_size)
        preds = model(img)  # (1, class_number)
        print('Build VPT model with in/out shape: ', img.shape, ' -> ', preds.shape)

    except Exception as err:
        print("Problem exist in the model defining process!!")
        print(repr(err))
        return -1
    else:
        print('model is ready now!')
        return model
# ResNet Bottleneck_block_constructor
class Bottleneck_block_constructor(nn.Module):
    """
    ResNet bottleneck block: 1x1 conv -> 3x3 conv -> 1x1 conv, each followed by
    BatchNorm and ReLU, then a residual addition and a final ReLU.

    The output has ``midplane * extention`` channels. The stride is applied on
    the first 1x1 convolution.

    :param inplane: number of input channels
    :param midplane: number of channels of the bottleneck (middle) convolutions
    :param stride: stride of the first 1x1 convolution
    :param downsample: optional module applied to the input to build the residual
        branch; when None the input itself is used, so it must already have the
        output shape
    """

    extention = 4  # channel expansion ratio: out channels = midplane * extention

    # define the layers and parameters of the block
    def __init__(self, inplane, midplane, stride, downsample=None):
        super(Bottleneck_block_constructor, self).__init__()

        outplane = midplane * self.extention

        self.conv1 = nn.Conv2d(inplane, midplane, kernel_size=1, stride=stride, bias=False)
        self.bn1 = nn.BatchNorm2d(midplane)

        self.conv2 = nn.Conv2d(midplane, midplane, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(midplane)

        self.conv3 = nn.Conv2d(midplane, outplane, kernel_size=1, stride=1, bias=False)
        self.bn3 = nn.BatchNorm2d(outplane)

        self.relu = nn.ReLU(inplace=False)

        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        # NOTE: this block applies the ReLU before the residual addition as well
        out = self.relu(self.bn3(self.conv3(out)))

        residual = x if self.downsample is None else self.downsample(x)

        # out-of-place add: `out` is the output of a ReLU, which autograd keeps
        # for the backward pass, so it must not be modified in place
        out = out + residual

        return self.relu(out)
bottleneck_channels_setting=None, identity_layers_setting=None, + stage_stride_setting=None, fc_num_classes=None, feature_idx=None): + + if bottleneck_channels_setting is None: + bottleneck_channels_setting = [64, 128, 256, 512] + if identity_layers_setting is None: + identity_layers_setting = [3, 4, 6, 3] + if stage_stride_setting is None: + stage_stride_setting = [1, 2, 2, 2] + + self.inplane = 64 + self.fc_num_classes = fc_num_classes + self.feature_idx = feature_idx + + super(Hybrid_backbone_4, self).__init__() + + self.block_constructor = block_constructor # Bottleneck_block_constructor + self.bcs = bottleneck_channels_setting # [64, 128, 256, 512] + self.ils = identity_layers_setting # [3, 4, 6, 3] + self.sss = stage_stride_setting # [1, 2, 2, 2] + + # stem + # alter the RGB pic chanel to match inplane + self.conv1 = nn.Conv2d(3, self.inplane, kernel_size=7, stride=2, padding=3, bias=False) + self.bn1 = nn.BatchNorm2d(self.inplane) + self.relu = nn.ReLU() + self.maxpool = nn.MaxPool2d(kernel_size=3, padding=1, stride=2) + + # ResNet stages + self.layer1 = self.make_stage_layer(self.block_constructor, self.bcs[0], self.ils[0], self.sss[0]) + self.layer2 = self.make_stage_layer(self.block_constructor, self.bcs[1], self.ils[1], self.sss[1]) + self.layer3 = self.make_stage_layer(self.block_constructor, self.bcs[2], self.ils[2], self.sss[2]) + self.layer4 = self.make_stage_layer(self.block_constructor, self.bcs[3], self.ils[3], self.sss[3]) + + # cls head + if self.fc_num_classes is not None: + self.avgpool = nn.AvgPool2d(7) + self.fc = nn.Linear(512 * self.block_constructor.extention, fc_num_classes) + + def forward(self, x): + + # stem + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + stem_out = self.maxpool(out) + + # Resnet block of 4 stages + stage1_out = self.layer1(stem_out) + stage2_out = self.layer2(stage1_out) + stage3_out = self.layer3(stage2_out) + stage4_out = self.layer4(stage3_out) + + if self.fc_num_classes is not None: + # 
class Hybrid_backbone_3(nn.Module):  # 3 stages version
    """
    ResNet-style backbone with a stem and 3 stages, with an optional classification head.

    :param block_constructor: block class used to build the stages, called as
        ``block_constructor(inplane, midplane, stride=..., downsample=...)`` and
        exposing the channel expansion ratio as ``block_constructor.extention``
    :param bottleneck_channels_setting: middle channels of each stage, default [64, 128, 256]
    :param identity_layers_setting: number of blocks of each stage, default [3, 4, 6]
    :param stage_stride_setting: stride of the first block of each stage, default [1, 2, 2]
    :param fc_num_classes: when not None, a global average pooling + linear head with
        this many outputs is attached after the last stage
    :param feature_idx: selects what forward returns:
        - 'stages': (stage1, stage2, stage3[, fc_out])
        - 'features': (stem, stage1, stage2, stage3[, fc_out])
        - anything else (e.g. None): fc_out if the head exists, otherwise stage3
    """

    def __init__(self, block_constructor, bottleneck_channels_setting=None, identity_layers_setting=None,
                 stage_stride_setting=None, fc_num_classes=None, feature_idx=None):
        # initialise nn.Module before any attribute is set on the instance
        super(Hybrid_backbone_3, self).__init__()

        if bottleneck_channels_setting is None:
            bottleneck_channels_setting = [64, 128, 256]
        if identity_layers_setting is None:
            identity_layers_setting = [3, 4, 6]
        if stage_stride_setting is None:
            stage_stride_setting = [1, 2, 2]

        self.inplane = 64  # running input channel count, updated by make_stage_layer
        self.fc_num_classes = fc_num_classes
        self.feature_idx = feature_idx

        self.block_constructor = block_constructor  # Bottleneck_block_constructor
        self.bcs = bottleneck_channels_setting  # [64, 128, 256]
        self.ils = identity_layers_setting  # [3, 4, 6]
        self.sss = stage_stride_setting  # [1, 2, 2]

        # stem: conv + bn + relu + maxpool, maps the RGB image to `inplane` channels
        self.conv1 = nn.Conv2d(3, self.inplane, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(self.inplane)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size=3, padding=1, stride=2)

        # ResNet 3 stages
        self.layer1 = self.make_stage_layer(self.block_constructor, self.bcs[0], self.ils[0], self.sss[0])
        self.layer2 = self.make_stage_layer(self.block_constructor, self.bcs[1], self.ils[1], self.sss[1])
        self.layer3 = self.make_stage_layer(self.block_constructor, self.bcs[2], self.ils[2], self.sss[2])

        # cls head
        if self.fc_num_classes is not None:
            # Global average pooling. The former fixed AvgPool2d(24) only fitted the
            # 24x24 map of a 384 input (a 224 input gives 14x14 and failed); the
            # adaptive pooling gives the same result for 384 and works for any size.
            self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
            self.fc = nn.Linear(self.bcs[-1] * self.block_constructor.extention, fc_num_classes)

    def forward(self, x):
        # stem: conv + bn + relu + maxpool
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        stem_out = self.maxpool(out)

        # Resnet 3 stages
        stage1_out = self.layer1(stem_out)
        stage2_out = self.layer2(stage1_out)
        stage3_out = self.layer3(stage2_out)

        has_head = self.fc_num_classes is not None
        if has_head:
            # connect to the cls head if asked
            fc_out = self.avgpool(stage3_out)
            fc_out = torch.flatten(fc_out, 1)
            fc_out = self.fc(fc_out)

        # get what we need for different usage
        if self.feature_idx == 'stages':
            if has_head:
                return stage1_out, stage2_out, stage3_out, fc_out
            return stage1_out, stage2_out, stage3_out

        if self.feature_idx == 'features':
            if has_head:
                return stem_out, stage1_out, stage2_out, stage3_out, fc_out
            return stem_out, stage1_out, stage2_out, stage3_out

        # self.feature_idx is None
        if has_head:
            return fc_out
        return stage3_out

    def make_stage_layer(self, block_constractor, midplane, block_num, stride=1):
        """
        Build one stage: a first block that may change stride / channels,
        followed by ``block_num - 1`` blocks with stride 1.

        :param block_constractor: block class, see the class docstring
        :param midplane: middle channels of the blocks, output channels are
            ``midplane * block_constractor.extention``
        :param block_num: number of blocks in the stage
        :param stride: stride of the first block of the stage

        :return: nn.Sequential of the stacked blocks
        """
        outplane = midplane * block_constractor.extention  # extention

        # the residual branch needs a projection when the first block changes
        # the spatial size or the channel count
        if stride != 1 or self.inplane != outplane:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplane, outplane, stride=stride, kernel_size=1, bias=False),
                nn.BatchNorm2d(outplane)
            )
        else:
            downsample = None

        # Conv Block
        block_list = [block_constractor(self.inplane, midplane, stride=stride, downsample=downsample)]

        self.inplane = outplane  # update inplane for the following blocks and the next stage

        # Identity Blocks
        for _ in range(1, block_num):
            block_list.append(block_constractor(self.inplane, midplane, stride=1, downsample=None))

        return nn.Sequential(*block_list)  # stack blocks
by the format of Model_size_other_info + :param edge_size: the input edge size of the dataloder + :param pretrained: The backbone CNN is initiate randomly or by its official Pretrained models + :param num_classes: classification required number of your dataset + + :param drop_rate: The dropout layer's probility of proposed models + :param attn_drop_rate: The dropout layer(right after the MHSA block or MHGA block)'s probility of proposed models + :param drop_path_rate: The probility of stochastic depth + + :param use_cls_token: To use the class token + :param use_pos_embedding: To use the positional enbedding + :param use_att_module: To use which attention module in the FGD Focus block + # use_att_module in ['SimAM', 'CBAM', 'SE'] different attention module we applied in the ablation study + + :return: prepared model + """ + + if pretrained: + from torchvision import models + backbone_weights = models.resnet50(pretrained=True).state_dict() + # True for pretrained Resnet50 model, False will randomly initiate + else: + backbone_weights = None + + if model_idx[0:11] == 'Hybrid1_224' and edge_size == 224: # ablation study: no focus depth=8 edge_size == 224 + backbone = Hybrid_backbone_4(block_constructor=Bottleneck_block_constructor, + bottleneck_channels_setting=[64, 128, 256, 512], + identity_layers_setting=[3, 4, 6, 3], + stage_stride_setting=[1, 2, 2, 2], + fc_num_classes=None, + feature_idx=None) + + if pretrained: + try: + backbone.load_state_dict(backbone_weights, False) + except: + print("backbone not loaded") + else: + print("backbone loaded") + + model = Hybrid_a(backbone, img_size=edge_size, patch_size=1, in_chans=3, num_classes=num_classes, embed_dim=768, + depth=8, num_heads=12, mlp_ratio=4., qkv_bias=True, representation_size=None, + drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, drop_path_rate=drop_path_rate, + norm_layer=None, act_layer=None) + + elif model_idx[0:11] == 'Hybrid1_384' and edge_size == 384: # ablation study: no focus depth=8 edge_size 
== 384 + backbone = Hybrid_backbone_4(block_constructor=Bottleneck_block_constructor, + bottleneck_channels_setting=[64, 128, 256, 512], + identity_layers_setting=[3, 4, 6, 3], + stage_stride_setting=[1, 2, 2, 2], + fc_num_classes=None, + feature_idx=None) + + if pretrained: + try: + backbone.load_state_dict(backbone_weights, False) + except: + print("backbone not loaded") + else: + print("backbone loaded") + + model = Hybrid_a(backbone, img_size=edge_size, patch_size=1, in_chans=3, num_classes=num_classes, embed_dim=768, + depth=8, num_heads=12, mlp_ratio=4., qkv_bias=True, representation_size=None, + drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, drop_path_rate=drop_path_rate, + norm_layer=None, act_layer=None) + + elif model_idx[0:11] == 'Hybrid2_224' and edge_size == 224: # Proposed model ablation study: edge_size==224 + backbone = Hybrid_backbone_4(block_constructor=Bottleneck_block_constructor, + bottleneck_channels_setting=[64, 128, 256, 512], + identity_layers_setting=[3, 4, 6, 3], + stage_stride_setting=[1, 2, 2, 2], + fc_num_classes=None, + feature_idx='stages') + if pretrained: + try: + backbone.load_state_dict(backbone_weights, False) + except: + print("backbone not loaded") + else: + print("backbone loaded") + + model = Transformer_blocks.Stage_wise_hybrid_Transformer(backbone, num_classes=num_classes, + drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, + drop_path_rate=drop_path_rate, + use_cls_token=use_cls_token, + use_pos_embedding=use_pos_embedding, + use_att_module=use_att_module, + stage_size=(56, 28, 14, 7), + stage_dim=[256, 512, 1024, 2048]) + + elif model_idx[0:11] == 'Hybrid2_384' and edge_size == 384: # Proposed model 384 !!! 
+ backbone = Hybrid_backbone_4(block_constructor=Bottleneck_block_constructor, + bottleneck_channels_setting=[64, 128, 256, 512], + identity_layers_setting=[3, 4, 6, 3], + stage_stride_setting=[1, 2, 2, 2], + fc_num_classes=None, + feature_idx='stages') + if pretrained: + try: + backbone.load_state_dict(backbone_weights, False) + except: + print("backbone not loaded") + else: + print("backbone loaded") + + model = Transformer_blocks.Stage_wise_hybrid_Transformer(backbone, num_classes=num_classes, + drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, + drop_path_rate=drop_path_rate, + use_cls_token=use_cls_token, + use_pos_embedding=use_pos_embedding, + use_att_module=use_att_module, + stage_size=(96, 48, 24, 12), + stage_dim=[256, 512, 1024, 2048]) + + elif model_idx[0:11] == 'Hybrid3_224' and edge_size == 224: # Proposed model ablation study: edge_size==224 + backbone = Hybrid_backbone_3(block_constructor=Bottleneck_block_constructor, + bottleneck_channels_setting=[64, 128, 256], + identity_layers_setting=[3, 4, 6], + stage_stride_setting=[1, 2, 2], + fc_num_classes=None, + feature_idx='stages') + if pretrained: + try: + backbone.load_state_dict(backbone_weights, False) + except: + print("backbone not loaded") + else: + print("backbone loaded") + + model = Transformer_blocks.Stage_wise_hybrid_Transformer(backbone, num_classes=num_classes, + drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, + drop_path_rate=drop_path_rate, + use_cls_token=use_cls_token, + use_pos_embedding=use_pos_embedding, + use_att_module=use_att_module, + stage_size=(56, 28, 14), + stage_dim=[256, 512, 1024]) + + elif model_idx[0:11] == 'Hybrid3_384' and edge_size == 384: # Proposed model 384 !!! 
+ backbone = Hybrid_backbone_3(block_constructor=Bottleneck_block_constructor, + bottleneck_channels_setting=[64, 128, 256], + identity_layers_setting=[3, 4, 6], + stage_stride_setting=[1, 2, 2], + fc_num_classes=None, + feature_idx='stages') + if pretrained: + try: + backbone.load_state_dict(backbone_weights, False) + except: + print("backbone not loaded") + else: + print("backbone loaded") + + model = Transformer_blocks.Stage_wise_hybrid_Transformer(backbone, num_classes=num_classes, + drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, + drop_path_rate=drop_path_rate, + use_cls_token=use_cls_token, + use_pos_embedding=use_pos_embedding, + use_att_module=use_att_module, + stage_size=(96, 48, 24), + stage_dim=[256, 512, 1024]) + + else: + print('not a valid hybrid model') + return -1 + + return model diff --git a/PuzzleTuning/Backbone/Transformer_blocks.py b/PuzzleTuning/Backbone/Transformer_blocks.py new file mode 100644 index 0000000000000000000000000000000000000000..ff1bcf9965b10a3df635349b217d3c4590f1fa48 --- /dev/null +++ b/PuzzleTuning/Backbone/Transformer_blocks.py @@ -0,0 +1,1631 @@ +""" +Transformer blocks script ver: OCT 28th 15:00 + +bug fix: 'Cross-attn' name is used in MHGA for compareability + +by the authors, check our github page: +https://github.com/sagizty/Multi-Stage-Hybrid-Transformer + +based on:timm +https://www.freeaihub.com/post/94067.html + +""" + +import math +import logging +from functools import partial +from collections import OrderedDict + +import torch +import torch.nn as nn +import torch.nn.functional as F + +import numpy as np + +from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD + +from timm.models.layers import StdConv2dSame, DropPath, to_2tuple, trunc_normal_ + +from .attention_modules import simam_module, cbam_module, se_module + + +class FFN(nn.Module): # Mlp from timm + """ + FFN (from timm) + + :param in_features: + :param hidden_features: + :param out_features: + :param act_layer: + :param drop: + """ + + def 
class Attention(nn.Module):  # qkv Transform + MSA(MHSA) (Attention from timm)
    """
    Multi-head self-attention: fused qkv projection + scaled dot-product attention
    + output projection (from timm).

    # input  x.shape = batch, patch_number, patch_dim
    # output x.shape = batch, patch_number, patch_dim

    :param dim: feature dim of each patch token
    :param num_heads: number of attention heads
    :param qkv_bias: whether the fused qkv projection has a bias
    :param qk_scale: scale applied to q @ k^T, by default head_dim ** -0.5
    :param attn_drop: dropout rate on the attention weights
    :param proj_drop: dropout rate after the output projection
    """

    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads

        # a falsy qk_scale (None / 0) falls back to 1 / sqrt(head_dim)
        per_head_dim = dim // num_heads
        self.scale = qk_scale if qk_scale else per_head_dim ** -0.5

        # a single linear layer produces q, k and v together
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)

        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        n_batch, n_tokens, width = x.shape
        head_width = width // self.num_heads

        # [N, P, D] -> [N, P, 3D] -> [N, P, 3, H, D/H] -> [3, N, H, P, D/H]
        fused = self.qkv(x)
        fused = fused.reshape(n_batch, n_tokens, 3, self.num_heads, head_width)
        query, key, value = fused.permute(2, 0, 3, 1, 4).unbind(0)  # 3 x [N, H, P, D/H]

        # attention weights [N, H, P, P], softmax over the key axis
        scores = torch.matmul(query, key.transpose(-2, -1)) * self.scale
        weights = self.attn_drop(scores.softmax(dim=-1))

        # head fusion [N, H, P, D/H] -> [N, P, H, D/H] -> [N, P, D]
        merged = torch.matmul(weights, value).transpose(1, 2).reshape(n_batch, n_tokens, width)

        # output projection + dropout
        return self.proj_drop(self.proj(merged))
class Guided_Attention(nn.Module):  # q1 k1 v0 Transform + MSA(MHSA) (based on timm Attention)
    """
    Multi-head attention whose q and k come from guiding inputs (the Focus
    module) while v comes from the main input (based on timm Attention).

    # 3 inputs of x.shape = batch, patch_number, patch_dim
    # 1 output of x.shape = batch, patch_number, patch_dim

    :param dim: feature dim of each patch token
    :param num_heads: number of attention heads
    :param qkv_bias: whether the q / k / v projections have a bias
    :param qk_scale: scale applied to q @ k^T, by default head_dim ** -0.5
    :param attn_drop: dropout rate on the attention weights
    :param proj_drop: dropout rate after the output projection
    """

    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads

        # a falsy qk_scale (None / 0) falls back to 1 / sqrt(head_dim)
        per_head_dim = dim // num_heads
        self.scale = qk_scale if qk_scale else per_head_dim ** -0.5

        # separate projections, because q, k and v come from different inputs
        self.qT = nn.Linear(dim, dim, bias=qkv_bias)
        self.kT = nn.Linear(dim, dim, bias=qkv_bias)
        self.vT = nn.Linear(dim, dim, bias=qkv_bias)

        self.attn_drop = nn.Dropout(attn_drop)

        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, q_encoder, k_encoder, v_input):
        # the token layout is taken from v_input and applied to all three inputs
        n_batch, n_tokens, width = v_input.shape
        per_head = (n_batch, n_tokens, self.num_heads, width // self.num_heads)

        # project, then split heads: [N, P, D] -> [N, P, H, D/H] -> [N, H, P, D/H]
        query = self.qT(q_encoder).reshape(per_head).transpose(1, 2)
        key = self.kT(k_encoder).reshape(per_head).transpose(1, 2)
        value = self.vT(v_input).reshape(per_head).transpose(1, 2)

        # attention weights [N, H, P, P], softmax over the key axis
        scores = torch.matmul(query, key.transpose(-2, -1)) * self.scale
        weights = self.attn_drop(scores.softmax(dim=-1))

        # head fusion [N, H, P, D/H] -> [N, P, H, D/H] -> [N, P, D]
        merged = torch.matmul(weights, value).transpose(1, 2).reshape(n_batch, n_tokens, width)

        # output projection + dropout
        return self.proj_drop(self.proj(merged))
# MViT modules
# from https://github.com/facebookresearch/SlowFast/slowfast/models/attention.py
def attention_pool(tensor, pool, thw_shape, has_cls_embed=True, norm=None):
    """
    Pool a sequence of patch tokens by reshaping it to a (T, H, W) feature map.

    input:
    tensor of (B, Head, N, C) or (B, N, C)
    thw_shape: T, H, W, the shape of the feature map the tokens are laid out on
    (T is the number of video frames)

    prod(T, H, W) == N(Num_patches) - 1 (the cls token, if it is there)

    output:
    tensor of (B, Head, N_O, C) or (B, N_O, C), same rank as the input
    thw_shape: [T_O, H_O, W_O]

    :param tensor: input feature patches
    :param pool: pooling/conv layer applied on [B * Head, C, T, H, W];
        when None the inputs are returned untouched
    :param thw_shape: reconstruction feature map shape
    :param has_cls_embed: if a cls token is the first token of the sequence
    :param norm: optional norm layer applied on the pooled tokens

    :raises NotImplementedError: when the input is neither 3D nor 4D
    """
    # no pool: nothing to do
    if pool is None:
        return tensor, thw_shape

    input_rank = tensor.ndim
    if input_rank not in (3, 4):
        raise NotImplementedError(f"Unsupported input dimension {tensor.shape}")

    # work on [B, Head, N, C]; a 3D input is treated as a single head
    if input_rank == 3:
        tensor = tensor.unsqueeze(1)

    # the cls token is kept aside, it is not part of the feature map
    if has_cls_embed:
        cls_tok = tensor[:, :, :1, :]
        tensor = tensor[:, :, 1:, :]

    B, Head, N, C = tensor.shape
    T, H, W = thw_shape

    # [B, Head, N, C] -> [B * Head, T, H, W, C] -> [B * Head, C, T, H, W]
    feature_map = tensor.reshape(B * Head, T, H, W, C)
    feature_map = feature_map.permute(0, 4, 1, 2, 3).contiguous()

    # [B * Head, C, T, H, W] -> [B * Head, C, T_O, H_O, W_O]
    feature_map = pool(feature_map)  # 3D Pooling / 3D Conv

    # output T, H, W and the matching number of patches
    thw_shape = [feature_map.shape[axis] for axis in (2, 3, 4)]
    N_pooled = thw_shape[0] * thw_shape[1] * thw_shape[2]

    # [B * Head, C, T_O, H_O, W_O] -> [B, Head, C, N_O] -> [B, Head, N_O, C]
    tensor = feature_map.reshape(B, Head, C, N_pooled).transpose(2, 3)

    # put the cls token back in front: [B, Head, N_O + 1, C]
    if has_cls_embed:
        tensor = torch.cat((cls_tok, tensor), dim=2)

    if norm is not None:
        tensor = norm(tensor)

    # give back the rank we received: drop the single head of a 3D input
    if input_rank == 3:
        tensor = tensor.squeeze(1)

    return tensor, thw_shape
class MultiScaleAttention(nn.Module):  # Attention module
    """
    Multi-head attention whose q / k / v sequences can be pooled (down-sampled) first.

    input:
    tensor of (B, N, C)
    thw_shape: T, H, W - the CNN-style feature map shape of the tokens
    (T is the number of video frames)

    prod(T, H, W) == N(Num_patches) - 1 (cls token if it is there)

    output:
    tensor of (B, N_O, C)
    thw_shape: T_O, H_O, W_O (the shape of the pooled q sequence)

    :param dim: Transformer feature dim, must be divisible by num_heads
    :param num_heads: Transformer heads
    :param qkv_bias: projecting bias
    :param drop_rate: dropout rate applied after the output projection

    :param kernel_q: pooling kernel size for q
    :param kernel_kv: pooling kernel size for k and v
    :param stride_q: pooling kernel stride for q
    :param stride_kv: pooling kernel stride for k and v

    :param norm_layer: norm layer applied after the pooling conv (`conv` mode only)
    :param has_cls_embed: if cls token is used
    :param mode: mode for attention pooling(downsampling) Options include `conv`, `avg`, and `max`.
    :param pool_first: process pooling(downsampling) before linear projecting

    :raises ValueError: if dim is not divisible by num_heads
    :raises NotImplementedError: if mode is not one of `conv`, `avg`, `max`
    """

    def __init__(
            self,
            dim,
            num_heads=8,
            qkv_bias=False,
            drop_rate=0.0,
            kernel_q=(1, 1, 1),
            kernel_kv=(1, 1, 1),
            stride_q=(1, 1, 1),
            stride_kv=(1, 1, 1),
            norm_layer=nn.LayerNorm,
            has_cls_embed=True,
            # Options include `conv`, `avg`, and `max`.
            mode="conv",
            # If True, perform pool before projection.
            pool_first=False,
    ):
        super().__init__()

        # fail early: forward() splits C into num_heads equal chunks
        if dim % num_heads != 0:
            raise ValueError(f"dim ({dim}) must be divisible by num_heads ({num_heads})")
        if mode not in ("conv", "avg", "max"):
            raise NotImplementedError(f"Unsupported mode {mode}")

        self.pool_first = pool_first
        self.drop_rate = drop_rate
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim ** -0.5  # 1 / sqrt(head_dim)
        self.has_cls_embed = has_cls_embed

        # pad by half the kernel size, rounded down
        padding_q = [int(q // 2) for q in kernel_q]
        padding_kv = [int(kv // 2) for kv in kernel_kv]

        # projecting mlp
        self.q = nn.Linear(dim, dim, bias=qkv_bias)
        self.k = nn.Linear(dim, dim, bias=qkv_bias)
        self.v = nn.Linear(dim, dim, bias=qkv_bias)
        self.proj = nn.Linear(dim, dim)

        # always defined, so forward() needs no special-casing; Identity holds no
        # parameters, so state_dict keys are unchanged
        self.proj_drop = nn.Dropout(drop_rate) if drop_rate > 0.0 else nn.Identity()

        # Skip pooling with kernel and stride size of (1, 1, 1).
        if np.prod(kernel_q) == 1 and np.prod(stride_q) == 1:
            kernel_q = ()  # clear
        if np.prod(kernel_kv) == 1 and np.prod(stride_kv) == 1:
            kernel_kv = ()

        # pool_* / norm_* are always set (None when unused) so that forward()
        # does not have to probe with hasattr()
        self.pool_q, self.norm_q = self._build_pool(mode, head_dim, kernel_q, stride_q, padding_q, norm_layer)
        self.pool_k, self.norm_k = self._build_pool(mode, head_dim, kernel_kv, stride_kv, padding_kv, norm_layer)
        self.pool_v, self.norm_v = self._build_pool(mode, head_dim, kernel_kv, stride_kv, padding_kv, norm_layer)

    @staticmethod
    def _build_pool(mode, head_dim, kernel, stride, padding, norm_layer):
        """Return the (pool, norm) pair for one of q / k / v; (None, None) when the kernel is cleared."""
        if len(kernel) == 0:  # Skip pooling if kernel is cleared
            return None, None

        if mode == "conv":  # depth-wise nn.Conv3d with a fixed channel setting
            pool = nn.Conv3d(
                head_dim,
                head_dim,
                kernel,
                stride=stride,
                padding=padding,
                groups=head_dim,
                bias=False,
            )
            return pool, norm_layer(head_dim)

        # "avg" / "max": parameter-free pooling, no norm afterwards
        pool_op = nn.MaxPool3d if mode == "max" else nn.AvgPool3d
        return pool_op(kernel, stride, padding, ceil_mode=False), None

    def _split_heads(self, t):
        """[B, N, C] -> [B, N, H, C/H] -> [B, H, N, C/H]"""
        B, N, C = t.shape
        return t.reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)

    @staticmethod
    def _merge_heads(t):
        """[B, H, N, C/H] -> [B, N, H, C/H] -> [B, N, C]"""
        B, H, N, D = t.shape
        return t.permute(0, 2, 1, 3).reshape(B, N, H * D)

    def forward(self, x, thw_shape):
        """
        x: Transformer feature patches of (B, N, C)
        thw_shape: reconstruction feature map shape

        returns the attended sequence of (B, N_O, C) and the thw shape of the pooled q
        """
        B, N, C = x.shape

        # step 1: head split (and, unless pool_first, the q/k/v projection):
        # [B, N, C] -> [B, H, N, C/H]
        if self.pool_first:  # step a.1: split only, project after pooling
            q = k = v = self._split_heads(x)
        else:  # step b.1: project first
            # todo: a shared projection might give better interaction between q/k/v,
            #  while separate projections save computation
            q = self._split_heads(self.q(x))
            k = self._split_heads(self.k(x))
            v = self._split_heads(self.v(x))

        # step 2: calculate attention_pool feature sequence and its shape
        # [B, H, N0, C/H] -> [B, H, N1, C/H]
        q, q_shape = attention_pool(q, self.pool_q, thw_shape,
                                    has_cls_embed=self.has_cls_embed, norm=self.norm_q)
        k, k_shape = attention_pool(k, self.pool_k, thw_shape,
                                    has_cls_embed=self.has_cls_embed, norm=self.norm_k)
        v, v_shape = attention_pool(v, self.pool_v, thw_shape,
                                    has_cls_embed=self.has_cls_embed, norm=self.norm_v)

        if self.pool_first:  # step a.3: projection on the pooled sequences
            # [B, H, N1, C/H] -> [B, N1, C] -> MLP -> [B, N1, C] -> [B, H, N1, C/H]
            # (N1 is read from the pooled tensor itself)
            q = self._split_heads(self.q(self._merge_heads(q)))
            v = self._split_heads(self.v(self._merge_heads(v)))
            k = self._split_heads(self.k(self._merge_heads(k)))

        # step 3: attention calculation
        # multi-head attention [B, H, N1, C/H] -> [B, H, N1, C/H]
        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)

        # head squeeze [B, H, N1, C/H] -> [B, N1, H, C/H] -> [B, N1, C]
        N = q.shape[2]
        x = (attn @ v).transpose(1, 2).reshape(B, N, C)

        # step 4: output projection and dropout [B, N1, C] -> [B, N1, C]
        x = self.proj_drop(self.proj(x))

        return x, q_shape
mode for attention pooling(downsampling) Options include `conv`, `avg`, and `max`. + :param pool_first: process pooling(downsampling) before liner projecting + + """ + + def __init__( + self, + dim, + dim_out, + num_heads=8, + mlp_ratio=4.0, + qkv_bias=False, + drop_rate=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + up_rate=None, + kernel_q=(1, 1, 1), + kernel_kv=(1, 1, 1), + stride_q=(1, 1, 1), + stride_kv=(1, 1, 1), + has_cls_embed=True, + mode="conv", + pool_first=False, + ): + super().__init__() + + self.has_cls_embed = has_cls_embed + + # step 1: Attention projecting + self.dim = dim + self.dim_out = dim_out + self.norm1 = norm_layer(dim) # pre-norm + + self.attn = MultiScaleAttention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + drop_rate=drop_rate, + kernel_q=kernel_q, + kernel_kv=kernel_kv, + stride_q=stride_q, + stride_kv=stride_kv, + norm_layer=nn.LayerNorm, + has_cls_embed=self.has_cls_embed, + mode=mode, + pool_first=pool_first, + ) + + self.drop_path = (DropPath(drop_path) if drop_path > 0.0 else nn.Identity()) + + # residual connection for Attention projecting + kernel_skip = kernel_q # fixme ori: [s + 1 if s > 1 else s for s in stride_q] + stride_skip = stride_q + padding_skip = [int(skip // 2) for skip in kernel_skip] # 以半个kernal size进行padding,向下取整 + + self.pool_skip = ( + nn.MaxPool3d(kernel_skip, stride_skip, padding_skip, ceil_mode=False) + if len(kernel_skip) > 0 + else None) + + self.norm2 = norm_layer(dim) # pre-norm + + # step 2: FFN projecting + mlp_hidden_dim = int(dim * mlp_ratio) + + # here use FFN to encode feature into abstractive information in the dimension + # TODO: check the use case for up_rate, and merge the following lines + if up_rate is not None and up_rate > 1: + mlp_dim_out = dim * up_rate + else: + mlp_dim_out = dim_out + + self.mlp = FFN( + in_features=dim, + hidden_features=mlp_hidden_dim, + out_features=mlp_dim_out, + act_layer=act_layer, + drop=drop_rate, + ) + + # residual connection 
class PatchEmbed(nn.Module):  # PatchEmbed from timm
    """
    Image to Patch Embedding

    Cuts the image into non-overlapping patches with a strided convolution and
    returns them as a token sequence.

    :param img_size: input image size (int or pair)
    :param patch_size: patch size (int or pair)
    :param in_chans: number of input channels
    :param embed_dim: embedding dimension of each patch token
    """

    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
        super().__init__()
        self.img_size = to_2tuple(img_size)
        self.patch_size = to_2tuple(patch_size)

        # number of whole patches that fit along each image axis
        patches_axis0 = self.img_size[0] // self.patch_size[0]
        patches_axis1 = self.img_size[1] // self.patch_size[1]
        self.num_patches = patches_axis1 * patches_axis0

        # kernel == stride == patch size: one conv output position per patch
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=self.patch_size, stride=self.patch_size)

    def forward(self, x):
        B, C, H, W = x.shape
        # FIXME look at relaxing size constraints
        assert (H, W) == (self.img_size[0], self.img_size[1]), \
            f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."

        patch_map = self.proj(x)  # (B, embed_dim, H / patch, W / patch)
        tokens = patch_map.flatten(2)  # (B, embed_dim, num_patches)

        # (B, num_patches, embed_dim), e.g. (B, 14*14, 768) with the default settings
        return tokens.transpose(1, 2)
class Last_feature_map_Embed(nn.Module):
    """
    use this block to connect last CNN stage to the first Transformer block
    Extract feature map from CNN, flatten, project to embedding dim.

    # input x.shape = batch, feature_dim, feature_size[0], feature_size[1]
    # output x.shape = batch, patch_number, patch_dim

    :param patch_size: patch size on the feature map (int or pair)
    :param feature_size: spatial size of the incoming feature map (int or pair),
        must be divisible by patch_size
    :param feature_dim: channel number of the incoming feature map
    :param embed_dim: embedding dimension of the output tokens
    :param Attention_module: 'SimAM', 'CBAM', 'SE' or None; any other value
        disables the attention block
    """

    def __init__(self, patch_size=1, feature_size=(7, 7), feature_dim=2048, embed_dim=768,
                 Attention_module=None):
        super().__init__()

        # Attention module
        # The attribute is set on every path: previously an unrecognised name left it
        # undefined and forward() failed with AttributeError.
        if Attention_module == 'SimAM':
            self.Attention_module = simam_module(e_lambda=1e-4)
        elif Attention_module == 'CBAM':
            self.Attention_module = cbam_module(gate_channels=feature_dim)
        elif Attention_module == 'SE':
            self.Attention_module = se_module(channel=feature_dim)
        else:  # None or an unsupported name: no attention block
            self.Attention_module = None

        patch_size = to_2tuple(patch_size)
        self.patch_size = patch_size

        feature_size = to_2tuple(feature_size)

        # feature map should be matching the size
        assert feature_size[0] % self.patch_size[0] == 0 and feature_size[1] % self.patch_size[1] == 0

        self.grid_size = (feature_size[0] // self.patch_size[0], feature_size[1] // self.patch_size[1])  # patch

        self.num_patches = self.grid_size[0] * self.grid_size[1]

        # use the conv to split the patch by the following design:
        # kernel == stride == patch size, one output position per patch
        self.proj = nn.Conv2d(in_channels=feature_dim, out_channels=embed_dim,
                              kernel_size=self.patch_size, stride=self.patch_size)

    def forward(self, x):
        if self.Attention_module is not None:
            x = self.Attention_module(x)

        if isinstance(x, (list, tuple)):
            x = x[-1]  # last feature if backbone outputs list/tuple of features

        # x.shape: batch, feature_dim, feature_size[0], feature_size[1]
        # proj(x).shape: batch, embed_dim, patch_height_num, patch_width_num
        # flatten(2).shape: batch, embed_dim, patch_num
        # .transpose(1, 2).shape: batch, feature_patch_number, feature_patch_dim
        x = self.proj(x).flatten(2).transpose(1, 2)

        # output: x.shape = batch, patch_number, patch_dim
        return x
patch size of the last feature map + + # cheak feature map can be patchlize to target_feature_size + assert feature_size[0] % target_feature_size[0] == 0 and feature_size[1] % target_feature_size[1] == 0 + + # cheak target_feature map can be patchlize to patch + assert target_feature_size[0] % patch_size[0] == 0 and target_feature_size[1] % patch_size[1] == 0 + + # Attention block + if Attention_module is not None: + if Attention_module == 'SimAM': + self.Attention_module = simam_module(e_lambda=1e-4) + elif Attention_module == 'CBAM': + self.Attention_module = cbam_module(gate_channels=feature_dim) + elif Attention_module == 'SE': + self.Attention_module = se_module(channel=feature_dim) + else: + self.Attention_module = None + + # split focus ROI + self.focus_size = (feature_size[0] // target_feature_size[0], feature_size[1] // target_feature_size[1]) + self.num_focus = self.focus_size[0] * self.focus_size[1] + # by kernel_size=focus_size, stride=focus_size design + # output_size=target_feature_size=7x7 so as to match the minist feature map + + self.gaze = nn.MaxPool2d(self.focus_size, stride=self.focus_size) + self.glance = nn.AvgPool2d(self.focus_size, stride=self.focus_size) + # x.shape: batch, feature_dim, target_feature_size[0], target_feature_size[1] + + # split patch + self.grid_size = (target_feature_size[0] // patch_size[0], target_feature_size[1] // patch_size[1]) + self.num_patches = self.grid_size[0] * self.grid_size[1] + + # use CNN to project dim to patch_dim + self.gaze_proj = nn.Conv2d(in_channels=feature_dim, out_channels=embed_dim, + kernel_size=patch_size, stride=patch_size) + self.glance_proj = nn.Conv2d(in_channels=feature_dim, out_channels=embed_dim, + kernel_size=patch_size, stride=patch_size) + + self.norm_q = norm_layer(embed_dim) # Transformer nn.LayerNorm + self.norm_k = norm_layer(embed_dim) # Transformer nn.LayerNorm + + def forward(self, x): + if self.Attention_module is not None: + x = self.Attention_module(x) + + if isinstance(x, 
class Focus_SEmbed(nn.Module):  # Attention guided module for hybridzing the early stages CNN feature
    """
    self focus (q=k) based on FGD Focus block

    Extract feature map from CNN, flatten, project to embedding dim. and use them as attention guidance

    input: x.shape = batch, feature_dim, feature_size[0], feature_size[1]

    Firstly, an optional attention block is applied to the feature map

    Secondly, the feature map is reduced to target_feature_size by a single
    pooling path: max pooling (``self.gaze``)
    NOTE(review): the earlier description of this block spoke of a "glance"
    (average pooling) path, while the code uses nn.MaxPool2d - confirm which one is intended.

    after the pooling process 1 CNN will be used to project the dimension
    Finally, flatten and transpose will be applied, followed by a norm layer

    output 2 attention guidance which are the same tensor (q, k with k = q)
    x.shape = batch, patch_number, patch_dim

    :param patch_size: patch size on the pooled feature map (int or pair)
    :param target_feature_size: spatial size after pooling (int or pair),
        must divide feature_size and be divisible by patch_size
    :param feature_size: spatial size of the incoming feature map (int or pair)
    :param feature_dim: channel number of the incoming feature map
    :param embed_dim: embedding dimension of the output tokens
    :param Attention_module: 'SimAM', 'CBAM', 'SE' or None; any other value
        disables the attention block
    :param norm_layer: norm layer applied on the output tokens
    """

    def __init__(self, patch_size=1, target_feature_size=(7, 7), feature_size=(56, 56), feature_dim=256, embed_dim=768,
                 Attention_module=None, norm_layer=nn.LayerNorm):
        super().__init__()
        patch_size = to_2tuple(patch_size)
        feature_size = to_2tuple(feature_size)

        target_feature_size = to_2tuple(target_feature_size)

        # check the feature map can be split into target_feature_size regions
        assert feature_size[0] % target_feature_size[0] == 0 and feature_size[1] % target_feature_size[1] == 0

        # check the target feature map can be split into patches
        assert target_feature_size[0] % patch_size[0] == 0 and target_feature_size[1] % patch_size[1] == 0

        # Attention block
        # The attribute is set on every path: previously an unrecognised name left it
        # undefined and forward() failed with AttributeError.
        if Attention_module == 'SimAM':
            self.Attention_module = simam_module(e_lambda=1e-4)
        elif Attention_module == 'CBAM':
            self.Attention_module = cbam_module(gate_channels=feature_dim)
        elif Attention_module == 'SE':
            self.Attention_module = se_module(channel=feature_dim)
        else:  # None or an unsupported name: no attention block
            self.Attention_module = None

        # split focus ROI: kernel == stride == focus_size, so the pooled map has target_feature_size
        self.focus_size = (feature_size[0] // target_feature_size[0], feature_size[1] // target_feature_size[1])
        self.num_focus = self.focus_size[0] * self.focus_size[1]

        self.gaze = nn.MaxPool2d(self.focus_size, stride=self.focus_size)

        # split patch
        self.grid_size = (target_feature_size[0] // patch_size[0], target_feature_size[1] // patch_size[1])
        self.num_patches = self.grid_size[0] * self.grid_size[1]

        # use CNN to project dim to patch_dim
        self.proj = nn.Conv2d(in_channels=feature_dim, out_channels=embed_dim, kernel_size=patch_size,
                              stride=patch_size)

        self.norm_f = norm_layer(embed_dim)

    def forward(self, x):
        if self.Attention_module is not None:
            x = self.Attention_module(x)

        if isinstance(x, (list, tuple)):
            x = x[-1]  # last feature if backbone outputs list/tuple of features

        # x.shape: batch, feature_dim, feature_size[0], feature_size[1]
        # gaze(x).shape: batch, feature_dim, target_feature_size[0], target_feature_size[1]
        # proj(x).shape: batch, embed_dim, patch_height_num, patch_width_num
        # flatten(2).shape: batch, embed_dim, patch_num
        # .transpose(1, 2).shape: batch, feature_patch_number, feature_patch_dim
        q = self.norm_f(self.proj(self.gaze(x)).flatten(2).transpose(1, 2))
        k = q  # self focus: the same guidance is used as q and k

        # output x.shape = batch, patch_number, patch_dim
        return q, k
class Focus_SAggressive(nn.Module):  # Attention guided module for hybridzing the early stages CNN feature
    """
    Aggressive CNN self Focus
    Extract feature map from CNN, flatten, project to embedding dim. and use them as attention guidance

    input: x.shape = batch, feature_dim, feature_size[0], feature_size[1]

    Firstly, an optional attention block is applied to the feature map

    Secondly, 1 CNN (kernel == stride == focus_size * patch_size) will be used to
    down-sample the map and project the dimension in a single step

    Finally, flatten and transpose will be applied, followed by a norm layer

    output 2 attention guidance which are the same tensor (q, k with k = q)
    x.shape = batch, patch_number, patch_dim

    :param patch_size: patch size on the target feature map (int or pair)
    :param target_feature_size: spatial size the map is reduced to (int or pair),
        must divide feature_size and be divisible by patch_size
    :param feature_size: spatial size of the incoming feature map (int or pair)
    :param feature_dim: channel number of the incoming feature map
    :param embed_dim: embedding dimension of the output tokens
    :param Attention_module: 'SimAM', 'CBAM', 'SE' or None; any other value
        disables the attention block
    :param norm_layer: norm layer applied on the output tokens
    """

    def __init__(self, patch_size=1, target_feature_size=(7, 7), feature_size=(56, 56), feature_dim=256, embed_dim=768,
                 Attention_module=None, norm_layer=nn.LayerNorm):
        super().__init__()
        patch_size = to_2tuple(patch_size)
        feature_size = to_2tuple(feature_size)

        target_feature_size = to_2tuple(target_feature_size)

        # check the feature map can be split into target_feature_size regions
        assert feature_size[0] % target_feature_size[0] == 0 and feature_size[1] % target_feature_size[1] == 0

        # check the target feature map can be split into patches
        assert target_feature_size[0] % patch_size[0] == 0 and target_feature_size[1] % patch_size[1] == 0

        # Attention block
        # The attribute is set on every path: previously an unrecognised name left it
        # undefined and forward() failed with AttributeError.
        if Attention_module == 'SimAM':
            self.Attention_module = simam_module(e_lambda=1e-4)
        elif Attention_module == 'CBAM':
            self.Attention_module = cbam_module(gate_channels=feature_dim)
        elif Attention_module == 'SE':
            self.Attention_module = se_module(channel=feature_dim)
        else:  # None or an unsupported name: no attention block
            self.Attention_module = None

        # size of the region of the input map that ends up in one target position
        self.focus_size = (feature_size[0] // target_feature_size[0], feature_size[1] // target_feature_size[1])

        # conv kernel/stride covering one patch, measured on the input feature map
        self.grid_size = (self.focus_size[0] * patch_size[0], self.focus_size[1] * patch_size[1])
        self.num_patches = (feature_size[0] // self.grid_size[0]) * (feature_size[1] // self.grid_size[1])

        self.proj = nn.Conv2d(in_channels=feature_dim, out_channels=embed_dim,
                              kernel_size=self.grid_size, stride=self.grid_size)

        self.norm_f = norm_layer(embed_dim)

    def forward(self, x):
        if self.Attention_module is not None:
            x = self.Attention_module(x)

        if isinstance(x, (list, tuple)):
            x = x[-1]  # last feature if backbone outputs list/tuple of features

        # x.shape: batch, feature_dim, feature_size[0], feature_size[1]
        # proj(x).shape: batch, embed_dim, patch_height_num, patch_width_num
        # flatten(2).shape: batch, embed_dim, patch_num
        # .transpose(1, 2).shape: batch, feature_patch_number, feature_patch_dim
        q = self.norm_f(self.proj(x).flatten(2).transpose(1, 2))
        k = q  # self focus: the same guidance is used as q and k

        # output x.shape = batch, patch_number, patch_dim
        return q, k
self.num_tokens = 1 + norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) + act_layer = act_layer or nn.GELU + + self.patch_embed = embed_layer( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim)) + self.pos_drop = nn.Dropout(p=drop_rate) + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + + self.blocks = nn.Sequential(*[ + Encoder_Block(dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, drop=drop_rate, + attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, act_layer=act_layer) + for i in range(depth)]) + + self.norm = norm_layer(embed_dim) + + # Representation layer + if representation_size: + self.num_features = representation_size + self.pre_logits = nn.Sequential(OrderedDict([ + ('fc', nn.Linear(embed_dim, representation_size)), + ('act', nn.Tanh()) + ])) + else: + self.pre_logits = nn.Identity() + + # Classifier head(s) + self.head = nn.Linear(self.num_features, self.num_classes) if self.num_classes > 0 else nn.Identity() + self.head_dist = None + + def forward_features(self, x): + x = self.patch_embed(x) + # print(x.shape,self.pos_embed.shape) + cls_token = self.cls_token.expand(x.shape[0], -1, -1) # stole cls_tokens impl from Phil Wang, thanks + x = torch.cat((cls_token, x), dim=1) + x = self.pos_drop(x + self.pos_embed) + + x = self.blocks(x) + x = self.norm(x) + return self.pre_logits(x[:, 0]) # use cls token for cls head + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + + +class Stage_wise_hybrid_Transformer(nn.Module): + """ + MSHT: Multi Stage Backbone Transformer + Stem + 4 ResNet stages(Backbone)is used as backbone + then, last feature map patch embedding is used to connect the CNN output to 
the decoder1 input + + horizonally, 4 ResNet Stage has its feature map connecting to the Focus module + which we be use as attention guidance into the FGD decoder + """ + + def __init__(self, backbone, num_classes=1000, patch_size=1, embed_dim=768, depth=4, num_heads=8, mlp_ratio=4., + qkv_bias=True, representation_size=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., + use_cls_token=True, use_pos_embedding=True, use_att_module='SimAM', stage_size=(56, 28, 14, 7), + stage_dim=(256, 512, 1024, 2048), norm_layer=None, act_layer=None): + """ + Args: + backbone (nn.Module): input backbone = stem + 4 ResNet stages + num_classes (int): number of classes for classification head + patch_size (int, tuple): patch size + embed_dim (int): embedding dimension + depth (int): depth of transformer + num_heads (int): number of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + representation_size (Optional[int]): enable and set representation layer (pre-logits) to this value if set + drop_rate (float): dropout rate + attn_drop_rate (float): attention dropout rate + drop_path_rate (float): stochastic depth rate + + use_cls_token(bool): classification token + use_pos_embedding(bool): use positional embedding + use_att_module(str or None): use which attention module in embedding + + stage_size (int, tuple): the stage feature map size of ResNet stages + stage_dim (int, tuple): the stage feature map dimension of ResNet stages + norm_layer: (nn.Module): normalization layer + """ + super().__init__() + self.num_classes = num_classes + if len(stage_dim) != len(stage_size): + raise TypeError('stage_dim and stage_size mismatch!') + else: + self.stage_num = len(stage_dim) + + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + + self.cls_token_num = 1 if use_cls_token else 0 + self.use_pos_embedding = use_pos_embedding + + norm_layer = norm_layer or 
partial(nn.LayerNorm, eps=1e-6) + act_layer = act_layer or nn.GELU + + # backbone CNN + self.backbone = backbone + + # Attention module + if use_att_module is not None: + if use_att_module in ['SimAM', 'CBAM', 'SE']: + Attention_module = use_att_module + else: + Attention_module = None + else: + Attention_module = None + + self.patch_embed = Last_feature_map_Embed(patch_size=patch_size, feature_size=stage_size[-1], + feature_dim=stage_dim[-1], embed_dim=self.embed_dim, + Attention_module=Attention_module) + num_patches = self.patch_embed.num_patches + + # global sharing cls token and positional embedding + self.cls_token_0 = nn.Parameter(torch.zeros(1, 1, embed_dim)) # like message token + if self.use_pos_embedding: + self.pos_embed_0 = nn.Parameter(torch.zeros(1, num_patches + self.cls_token_num, embed_dim)) + + ''' + self.cls_token_1 = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed_1 = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim)) + + self.cls_token_2 = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed_2 = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim)) + + self.cls_token_3 = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed_3 = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim)) + + self.cls_token_4 = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed_4 = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim)) + ''' + + self.pos_drop = nn.Dropout(p=drop_rate) + # stochastic depth + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + + self.dec1 = Decoder_Block(dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[0], norm_layer=norm_layer, + act_layer=act_layer) + self.Fo1 = Focus_Embed(patch_size=patch_size, target_feature_size=stage_size[-1], feature_size=stage_size[0], + feature_dim=stage_dim[0], 
embed_dim=embed_dim, Attention_module=Attention_module, + norm_layer=norm_layer) + + self.dec2 = Decoder_Block(dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[1], norm_layer=norm_layer, + act_layer=act_layer) + self.Fo2 = Focus_Embed(patch_size=patch_size, target_feature_size=stage_size[-1], feature_size=stage_size[1], + feature_dim=stage_dim[1], embed_dim=embed_dim, Attention_module=Attention_module, + norm_layer=norm_layer) + + self.dec3 = Decoder_Block(dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[2], norm_layer=norm_layer, + act_layer=act_layer) + self.Fo3 = Focus_Embed(patch_size=patch_size, target_feature_size=stage_size[-1], feature_size=stage_size[2], + feature_dim=stage_dim[2], embed_dim=embed_dim, Attention_module=Attention_module, + norm_layer=norm_layer) + + if self.stage_num == 4: + self.dec4 = Decoder_Block(dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[3], norm_layer=norm_layer, + act_layer=act_layer) + self.Fo4 = Focus_Embed(patch_size=patch_size, target_feature_size=stage_size[-1], + feature_size=stage_size[-1], + feature_dim=stage_dim[-1], embed_dim=embed_dim, Attention_module=Attention_module, + norm_layer=norm_layer) + + self.norm = norm_layer(embed_dim) + + # Representation layer + if representation_size: + self.num_features = representation_size + self.pre_logits = nn.Sequential(OrderedDict([ + ('fc', nn.Linear(embed_dim, representation_size)), + ('act', nn.Tanh()) + ])) + else: + self.pre_logits = nn.Identity() + + # Classifier head(s) + self.head = nn.Linear(self.num_features, self.num_classes) if self.num_classes > 0 else nn.Identity() + self.head_dist = None + + def forward_features(self, x): + if self.stage_num == 3: + stage1_out, stage2_out, stage3_out = self.backbone(x) + # 
embedding the last feature map + x = self.patch_embed(stage3_out) + + elif self.stage_num == 4: + stage1_out, stage2_out, stage3_out, stage4_out = self.backbone(x) + # embedding the last feature map + x = self.patch_embed(stage4_out) + else: + raise TypeError('stage_dim is not legal !') + + # get guidance info + s1_q, s1_k = self.Fo1(stage1_out) + s2_q, s2_k = self.Fo2(stage2_out) + s3_q, s3_k = self.Fo3(stage3_out) + if self.stage_num == 4: + s4_q, s4_k = self.Fo4(stage4_out) + + if self.cls_token_num != 0: # concat cls token + # process the(cls token / message token) + cls_token_0 = self.cls_token_0.expand(x.shape[0], -1, -1) # stole cls_tokens impl from Phil Wang, thanks + x = torch.cat((cls_token_0, x), dim=1) # 增加classification head patch + + s1_q = torch.cat((cls_token_0, s1_q), dim=1) + s1_k = torch.cat((cls_token_0, s1_k), dim=1) + s2_q = torch.cat((cls_token_0, s2_q), dim=1) + s2_k = torch.cat((cls_token_0, s2_k), dim=1) + s3_q = torch.cat((cls_token_0, s3_q), dim=1) + s3_k = torch.cat((cls_token_0, s3_k), dim=1) + if self.stage_num == 4: + s4_q = torch.cat((cls_token_0, s4_q), dim=1) + s4_k = torch.cat((cls_token_0, s4_k), dim=1) + + if self.use_pos_embedding: + + s1_q = self.pos_drop(s1_q + self.pos_embed_0) + s1_k = self.pos_drop(s1_k + self.pos_embed_0) + s2_q = self.pos_drop(s2_q + self.pos_embed_0) + s2_k = self.pos_drop(s2_k + self.pos_embed_0) + s3_q = self.pos_drop(s3_q + self.pos_embed_0) + s3_k = self.pos_drop(s3_k + self.pos_embed_0) + if self.stage_num == 4: + s4_q = self.pos_drop(s4_q + self.pos_embed_0) + s4_k = self.pos_drop(s4_k + self.pos_embed_0) + + # plus to encoding positional infor + x = self.pos_drop(x + self.pos_embed_0) + + else: + + s1_q = self.pos_drop(s1_q) + s1_k = self.pos_drop(s1_k) + s2_q = self.pos_drop(s2_q) + s2_k = self.pos_drop(s2_k) + s3_q = self.pos_drop(s3_q) + s3_k = self.pos_drop(s3_k) + if self.stage_num == 4: + s4_q = self.pos_drop(s4_q) + s4_k = self.pos_drop(s4_k) + + # stem's feature map + x = 
self.pos_drop(x) + + # Decoder module use the guidance to help global modeling process + + x = self.dec1(s1_q, s1_k, x) + + x = self.dec2(s2_q, s2_k, x) + + x = self.dec3(s3_q, s3_k, x) + + if self.stage_num == 4: + x = self.dec4(s4_q, s4_k, x) + + x = self.norm(x) + return self.pre_logits(x[:, 0]) # take the first cls token + + def forward(self, x): + x = self.forward_features(x) # connect the cls token to the cls head + x = self.head(x) + return x diff --git a/PuzzleTuning/Backbone/VPT_structure.py b/PuzzleTuning/Backbone/VPT_structure.py new file mode 100644 index 0000000000000000000000000000000000000000..5c8802c71295c7a20103bdc0cf19484a7194e84a --- /dev/null +++ b/PuzzleTuning/Backbone/VPT_structure.py @@ -0,0 +1,133 @@ +""" +VPT Script ver: Oct 17th 14:30 + +based on +timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm +""" + +import torch +import torch.nn as nn + +from timm.models.vision_transformer import VisionTransformer, PatchEmbed + + +class VPT_ViT(VisionTransformer): + def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12, + num_heads=12, mlp_ratio=4., qkv_bias=True, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., + embed_layer=PatchEmbed, norm_layer=None, act_layer=None, Prompt_Token_num=1, + VPT_type="Shallow", basic_state_dict=None): + + # Recreate ViT + super().__init__(img_size=img_size, patch_size=patch_size, in_chans=in_chans, num_classes=num_classes, + embed_dim=embed_dim, depth=depth, num_heads=num_heads, mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, + drop_path_rate=drop_path_rate, embed_layer=embed_layer, + norm_layer=norm_layer, act_layer=act_layer) + + # load basic state_dict + if basic_state_dict is not None: + self.load_state_dict(basic_state_dict, False) + + self.VPT_type = VPT_type + if VPT_type == "Deep": + self.Prompt_Tokens = nn.Parameter(torch.zeros(depth, Prompt_Token_num, embed_dim)) + else: # "Shallow" + 
self.Prompt_Tokens = nn.Parameter(torch.zeros(1, Prompt_Token_num, embed_dim)) + + def New_CLS_head(self, new_classes=15): + if new_classes != 0: + self.head = nn.Linear(self.embed_dim, new_classes) + else: + self.head = nn.Identity() + + def Freeze(self): + for param in self.parameters(): + param.requires_grad = False + + self.Prompt_Tokens.requires_grad = True + try: + for param in self.head.parameters(): + param.requires_grad = True + except: + pass + + def UnFreeze(self): + for param in self.parameters(): + param.requires_grad = True + + def obtain_prompt(self): + prompt_state_dict = {'head': self.head.state_dict(), + 'Prompt_Tokens': self.Prompt_Tokens} + # print(prompt_state_dict) + return prompt_state_dict + + def load_prompt(self, prompt_state_dict): + try: + self.head.load_state_dict(prompt_state_dict['head'], False) + except: + print('head not match, so skip head') + else: + print('prompt head match') + + if self.Prompt_Tokens.shape == prompt_state_dict['Prompt_Tokens'].shape: + + # device check + Prompt_Tokens = nn.Parameter(prompt_state_dict['Prompt_Tokens'].cpu()) + Prompt_Tokens.to(torch.device(self.Prompt_Tokens.device)) + + self.Prompt_Tokens = Prompt_Tokens + + else: + print('\n !!! 
cannot load prompt') + print('shape of model req prompt', self.Prompt_Tokens.shape) + print('shape of model given prompt', prompt_state_dict['Prompt_Tokens'].shape) + print('') + + def forward_features(self, x): + x = self.patch_embed(x) + # print(x.shape,self.pos_embed.shape) + cls_token = self.cls_token.expand(x.shape[0], -1, -1) + + # concatenate CLS token + x = torch.cat((cls_token, x), dim=1) + x = self.pos_drop(x + self.pos_embed) + + if self.VPT_type == "Deep": + + Prompt_Token_num = self.Prompt_Tokens.shape[1] + + for i in range(len(self.blocks)): + # concatenate Prompt_Tokens + Prompt_Tokens = self.Prompt_Tokens[i].unsqueeze(0) + # firstly concatenate + x = torch.cat((x, Prompt_Tokens.expand(x.shape[0], -1, -1)), dim=1) + num_tokens = x.shape[1] + # lastly remove, a genius trick + x = self.blocks[i](x)[:, :num_tokens - Prompt_Token_num] + + else: # self.VPT_type == "Shallow" + Prompt_Token_num = self.Prompt_Tokens.shape[1] + + # concatenate Prompt_Tokens + Prompt_Tokens = self.Prompt_Tokens.expand(x.shape[0], -1, -1) + x = torch.cat((x, Prompt_Tokens), dim=1) + num_tokens = x.shape[1] + # Sequntially procees + x = self.blocks(x)[:, :num_tokens - Prompt_Token_num] + + x = self.norm(x) + return x + + def forward(self, x): + + x = self.forward_features(x) + + # use cls token for cls head + try: + x = self.pre_logits(x[:, 0, :]) + except: + x = self.fc_norm(x[:, 0, :]) + else: + pass + x = self.head(x) + return x diff --git a/PuzzleTuning/Backbone/attention_modules.py b/PuzzleTuning/Backbone/attention_modules.py new file mode 100644 index 0000000000000000000000000000000000000000..207b2cbc4f08b996489e4fc757a3afdb08194025 --- /dev/null +++ b/PuzzleTuning/Backbone/attention_modules.py @@ -0,0 +1,303 @@ +""" +attention modules in ['SimAM', 'CBAM', 'SE', 'GAM'] were applied in the ablation study + +ver: Dec 24th 15:00 + + +ref: +https://github.com/xmu-xiaoma666/External-Attention-pytorch +""" + +import torch +import torch.nn as nn +import math +import 
torch.nn.functional as F +from torch.nn import init + + +# help func +class BasicConv(nn.Module): + def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0, dilation=1, groups=1, relu=True, + bn=True, bias=False): + super(BasicConv, self).__init__() + self.out_channels = out_planes + self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding, + dilation=dilation, groups=groups, bias=bias) + self.bn = nn.BatchNorm2d(out_planes, eps=1e-5, momentum=0.01, affine=True) if bn else None + self.relu = nn.ReLU() if relu else None + + def forward(self, x): + x = self.conv(x) + if self.bn is not None: + x = self.bn(x) + if self.relu is not None: + x = self.relu(x) + return x + + +class Flatten(nn.Module): + def forward(self, x): + return x.view(x.size(0), -1) + + +class ChannelGate(nn.Module): + def __init__(self, gate_channels, reduction_ratio=16, pool_types=['avg', 'max']): + super(ChannelGate, self).__init__() + self.gate_channels = gate_channels + self.mlp = nn.Sequential( + Flatten(), + nn.Linear(gate_channels, int(gate_channels // reduction_ratio)), + nn.ReLU(), + nn.Linear(int(gate_channels // reduction_ratio), gate_channels) + ) + self.pool_types = pool_types + + def forward(self, x): + channel_att_sum = None + for pool_type in self.pool_types: + if pool_type == 'avg': + avg_pool = F.avg_pool2d(x, (x.size(2), x.size(3)), stride=(x.size(2), x.size(3))) + channel_att_raw = self.mlp(avg_pool) + elif pool_type == 'max': + max_pool = F.max_pool2d(x, (x.size(2), x.size(3)), stride=(x.size(2), x.size(3))) + channel_att_raw = self.mlp(max_pool) + elif pool_type == 'lp': + lp_pool = F.lp_pool2d(x, 2, (x.size(2), x.size(3)), stride=(x.size(2), x.size(3))) + channel_att_raw = self.mlp(lp_pool) + elif pool_type == 'lse': + # LSE pool only + lse_pool = logsumexp_2d(x) + channel_att_raw = self.mlp(lse_pool) + + if channel_att_sum is None: + channel_att_sum = channel_att_raw + else: + channel_att_sum = channel_att_sum + 
channel_att_raw + + scale = F.sigmoid(channel_att_sum).unsqueeze(2).unsqueeze(3).expand_as(x) + return x * scale + + +def logsumexp_2d(tensor): + tensor_flatten = tensor.view(tensor.size(0), tensor.size(1), -1) + s, _ = torch.max(tensor_flatten, dim=2, keepdim=True) + outputs = s + (tensor_flatten - s).exp().sum(dim=2, keepdim=True).log() + return outputs + + +class ChannelPool(nn.Module): + def forward(self, x): + return torch.cat((torch.max(x, 1)[0].unsqueeze(1), torch.mean(x, 1).unsqueeze(1)), dim=1) + + +class SpatialGate(nn.Module): + def __init__(self): + super(SpatialGate, self).__init__() + kernel_size = 7 + self.compress = ChannelPool() + self.spatial = BasicConv(2, 1, kernel_size, stride=1, padding=int((kernel_size - 1) // 2), relu=False) + + def forward(self, x): + x_compress = self.compress(x) + x_out = self.spatial(x_compress) + scale = F.sigmoid(x_out) # broadcasting + return x * scale + + +# attention modules: +class cbam_module(nn.Module): + """ + module:CBAM + + input、output= b, c, h, w + + paper: + https://arxiv.org/abs/1807.06521 + code: + https://github.com/ZjjConan/SimAM/blob/master/networks/attentions + """ + + def __init__(self, gate_channels, reduction=16, pool_types=['avg', 'max'], no_spatial=False): + super(cbam_module, self).__init__() + self.ChannelGate = ChannelGate(gate_channels, reduction, pool_types) + self.no_spatial = no_spatial + if not no_spatial: + self.SpatialGate = SpatialGate() + + @staticmethod + def get_module_name(): + return "cbam" + + def forward(self, x): + x_out = self.ChannelGate(x) + if not self.no_spatial: + x_out = self.SpatialGate(x_out) + return x_out + + +class se_module(nn.Module): + """ + module: SE + + input、output= b, c, h, w + + from paper Squeeze-and-Excitation Networks + SE-Net https://arxiv.org/abs/1709.01507 + code: + https://github.com/ZjjConan/SimAM/blob/master/networks/attentions + """ + + def __init__(self, channel, reduction=16): + super(se_module, self).__init__() + self.avg_pool = 
nn.AdaptiveAvgPool2d(1) + self.fc = nn.Sequential( + nn.Linear(channel, int(channel // reduction), bias=False), + nn.ReLU(inplace=True), + nn.Linear(int(channel // reduction), channel, bias=False), + nn.Sigmoid() + ) + + @staticmethod + def get_module_name(): + return "se" + + def forward(self, x): + b, c, _, _ = x.size() + y = self.avg_pool(x).view(b, c) + y = self.fc(y).view(b, c, 1, 1) + return x * y + + +class simam_module(torch.nn.Module): + """ + module:SimAM + + input、output= b, c, h, w + + paper:(ICML) + SimAM: A Simple, Parameter-Free Attention Module for Convolutional Neural Networks + code: + https://github.com/ZjjConan/SimAM/blob/master/networks/attentions/simam_module.py + """ + + def __init__(self, channels=None, e_lambda=1e-4): + super(simam_module, self).__init__() + + self.activaton = nn.Sigmoid() + self.e_lambda = e_lambda + + def __repr__(self): + s = self.__class__.__name__ + '(' + s += ('lambda=%f)' % self.e_lambda) + return s + + @staticmethod + def get_module_name(): + return "simam" + + def forward(self, x): + b, c, h, w = x.size() + + n = w * h - 1 + + x_minus_mu_square = (x - x.mean(dim=[2, 3], keepdim=True)).pow(2) + y = x_minus_mu_square / (4 * (x_minus_mu_square.sum(dim=[2, 3], keepdim=True) / n + self.e_lambda)) + 0.5 + + return x * self.activaton(y) + + +class ResidualAttention(nn.Module): + """ + module: ResidualAttention + + input、output= b, c, h, w + + Paper:ICCV 2021 Residual Attention: A Simple but Effective Method for Multi-Label Recognition + code:https://github.com/xmu-xiaoma666/External-Attention-pytorch/blob/master/attention/ResidualAttention.py + """ + + def __init__(self, channel=512, num_class=1000, la=0.2): + super().__init__() + self.la = la + self.fc = nn.Conv2d(in_channels=channel, out_channels=num_class, kernel_size=1, stride=1, bias=False) + + def forward(self, x): + b, c, h, w = x.shape + y_raw = self.fc(x).flatten(2) # b,num_class,hxw + y_avg = torch.mean(y_raw, dim=2) # b,num_class + y_max = torch.max(y_raw, 
dim=2)[0] # b,num_class + score = y_avg + self.la * y_max + return score + + +class eca_module(nn.Module): + """Constructs a ECA module. + + Args: + channel: Number of channels of the input feature map + k_size: Adaptive selection of kernel size + """ + def __init__(self, channel, k_size=3): + super(eca_module, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.conv = nn.Conv1d(1, 1, kernel_size=k_size, padding=(k_size - 1) // 2, bias=False) + self.sigmoid = nn.Sigmoid() + + def forward(self, x): + # x: input features with shape [b, c, h, w] + b, c, h, w = x.size() + + # feature descriptor on the global spatial information + y = self.avg_pool(x) + + # Two different branches of ECA module + y = self.conv(y.squeeze(-1).transpose(-1, -2)).transpose(-1, -2).unsqueeze(-1) + + # Multi-scale information fusion + y = self.sigmoid(y) + + return x * y.expand_as(x) + + +class GAM_Attention(nn.Module): + """ + module:GAM + + input= b, in_channels, h, w + output= b, out_channels, h, w + + paper: + Global Attention Mechanism: Retain Information to Enhance Channel-Spatial Interactions + https://arxiv.org/abs/2112.05561 + code: + https://mp.weixin.qq.com/s/VL6rXjyUDmHToYTqM32hUg + """ + def __init__(self, in_channels, out_channels, rate=4): + super(GAM_Attention, self).__init__() + + self.channel_attention = nn.Sequential( + nn.Linear(in_channels, int(in_channels / rate)), + nn.ReLU(inplace=True), + nn.Linear(int(in_channels / rate), in_channels) + ) + + self.spatial_attention = nn.Sequential( + nn.Conv2d(in_channels, int(in_channels / rate), kernel_size=7, padding=3), + nn.BatchNorm2d(int(in_channels / rate)), + nn.ReLU(inplace=True), + nn.Conv2d(int(in_channels / rate), out_channels, kernel_size=7, padding=3), + nn.BatchNorm2d(out_channels) + ) + + def forward(self, x): + b, c, h, w = x.shape + x_permute = x.permute(0, 2, 3, 1).view(b, -1, c) + x_att_permute = self.channel_attention(x_permute).view(b, h, w, c) + x_channel_att = x_att_permute.permute(0, 3, 1, 2) + 
+ x = x * x_channel_att + + x_spatial_att = self.spatial_attention(x).sigmoid() + out = x * x_spatial_att + + return out diff --git a/PuzzleTuning/Backbone/counterpart_models/README.md b/PuzzleTuning/Backbone/counterpart_models/README.md new file mode 100644 index 0000000000000000000000000000000000000000..cb3ec6bc9f3d5afddec97e5c5cb6952e43745187 --- /dev/null +++ b/PuzzleTuning/Backbone/counterpart_models/README.md @@ -0,0 +1,25 @@ +Recent SOTA works in fine-grained Tasks + + +CrossFormer + +Paper: +https://arxiv.org/pdf/2108.00154.pdf + + +Code from: +https://github.com/cheerss/CrossFormer + + + +Conformer +Paper: +https://arxiv.org/pdf/2105.03889.pdf + + +Code from: +https://github.com/pengzhiliang/Conformer/blob/main/conformer.py + + +both work will be compared with official pretrained backbone +and a new MLP head (classification head). \ No newline at end of file diff --git a/PuzzleTuning/Backbone/counterpart_models/conformer.py b/PuzzleTuning/Backbone/counterpart_models/conformer.py new file mode 100644 index 0000000000000000000000000000000000000000..2ca3fcba4a6f815be98545b9e9594ce58fb84d10 --- /dev/null +++ b/PuzzleTuning/Backbone/counterpart_models/conformer.py @@ -0,0 +1,483 @@ +""" +From Conformer with alter: conv and trans cls head was changed to volting together +ver: DEC 1st 16:00 official release + +ref: https://github.com/pengzhiliang/Conformer/blob/main/conformer.py +""" + +import torch +import torch.nn as nn +import torch.nn.functional as F +from functools import partial + +from timm.models.layers import DropPath, trunc_normal_ + + +class Mlp(nn.Module): # FFN + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def 
forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Module): # MHSA + def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights + self.scale = qk_scale or head_dim ** -0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape # N is patch number, C is patch dimension + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) # re arrange + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Module): # Encoder from ViT + + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=partial(nn.LayerNorm, eps=1e-6)): + super().__init__() + # pre norm 1 + self.norm1 = norm_layer(dim) + # MHSA + self.attn = Attention( + dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + + # pre norm 2 + self.norm2 = norm_layer(dim) + + # FFN(MLP) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + def forward(self, x): + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class ConvBlock(nn.Module): # ResNet bottleneck Convblock actually + + def __init__(self, inplanes, outplanes, stride=1, res_conv=False, act_layer=nn.ReLU, groups=1, + norm_layer=partial(nn.BatchNorm2d, eps=1e-6), drop_block=None, drop_path=None): + super(ConvBlock, self).__init__() + + expansion = 4 + med_planes = outplanes // expansion + + self.conv1 = nn.Conv2d(inplanes, med_planes, kernel_size=1, stride=1, padding=0, bias=False) + self.bn1 = norm_layer(med_planes) + self.act1 = act_layer(inplace=True) + + self.conv2 = nn.Conv2d(med_planes, med_planes, kernel_size=3, stride=stride, groups=groups, padding=1, + bias=False) + self.bn2 = norm_layer(med_planes) + self.act2 = act_layer(inplace=True) + + self.conv3 = nn.Conv2d(med_planes, outplanes, kernel_size=1, stride=1, padding=0, bias=False) + self.bn3 = norm_layer(outplanes) + self.act3 = act_layer(inplace=True) + + if res_conv: + self.residual_conv = nn.Conv2d(inplanes, outplanes, kernel_size=1, stride=stride, padding=0, bias=False) + self.residual_bn = norm_layer(outplanes) + + self.res_conv = res_conv + self.drop_block = drop_block + self.drop_path = drop_path + + def zero_init_last_bn(self): + nn.init.zeros_(self.bn3.weight) + + def forward(self, x, x_t=None, return_x_2=True): + residual = x + + x = self.conv1(x) + x = self.bn1(x) + if self.drop_block is not None: + x = self.drop_block(x) + x = self.act1(x) + + x = self.conv2(x) if x_t is None else self.conv2(x + x_t) + x = self.bn2(x) + if self.drop_block is not None: + x = self.drop_block(x) + x2 = self.act2(x) + + x = self.conv3(x2) + x = self.bn3(x) + if self.drop_block is not None: + x = 
self.drop_block(x) + + if self.drop_path is not None: + x = self.drop_path(x) + + if self.res_conv: + residual = self.residual_conv(residual) + residual = self.residual_bn(residual) + + x += residual + x = self.act3(x) + + if return_x_2: + return x, x2 + else: + return x + + +class FCUDown(nn.Module): + """ CNN feature maps -> Transformer patch embeddings + """ + + def __init__(self, inplanes, outplanes, dw_stride, act_layer=nn.GELU, + norm_layer=partial(nn.LayerNorm, eps=1e-6)): + super(FCUDown, self).__init__() + self.dw_stride = dw_stride + + self.conv_project = nn.Conv2d(inplanes, outplanes, kernel_size=1, stride=1, padding=0) # fix dimension + self.sample_pooling = nn.AvgPool2d(kernel_size=dw_stride, stride=dw_stride) # fix feature map size + + self.ln = norm_layer(outplanes) + self.act = act_layer() + + def forward(self, x, x_t): + x = self.conv_project(x) # [N, C, H, W] + + x = self.sample_pooling(x).flatten(2).transpose(1, 2) + x = self.ln(x) + x = self.act(x) + + x = torch.cat([x_t[:, 0][:, None, :], x], dim=1) # concatenate class token from x_t + + return x + + +class FCUUp(nn.Module): + """ Transformer patch embeddings -> CNN feature maps + by interpolate operation + """ + + def __init__(self, inplanes, outplanes, up_stride, act_layer=nn.ReLU, + norm_layer=partial(nn.BatchNorm2d, eps=1e-6), ): + super(FCUUp, self).__init__() + + self.up_stride = up_stride + self.conv_project = nn.Conv2d(inplanes, outplanes, kernel_size=1, stride=1, padding=0) + self.bn = norm_layer(outplanes) + self.act = act_layer() + + def forward(self, x, H, W): # interpolate to + B, _, C = x.shape + + # [N, 197, 384] -> [N, 196, 384] -> [N, 384, 196] -> [N, 384, 14, 14] + x_r = x[:, 1:].transpose(1, 2).reshape(B, C, H, W) # drop cls token of x_t + + x_r = self.act(self.bn(self.conv_project(x_r))) + + return F.interpolate(x_r, size=(H * self.up_stride, W * self.up_stride)) # interpolate operation + + +class Med_ConvBlock(nn.Module): # ResNet bottleneck indentity actually + """ special 
case for Convblock without down sampling, + """ + + def __init__(self, inplanes, act_layer=nn.ReLU, groups=1, norm_layer=partial(nn.BatchNorm2d, eps=1e-6), + drop_block=None, drop_path=None): + + super(Med_ConvBlock, self).__init__() + + expansion = 4 + med_planes = inplanes // expansion + + self.conv1 = nn.Conv2d(inplanes, med_planes, kernel_size=1, stride=1, padding=0, bias=False) + self.bn1 = norm_layer(med_planes) + self.act1 = act_layer(inplace=True) + + self.conv2 = nn.Conv2d(med_planes, med_planes, kernel_size=3, stride=1, groups=groups, padding=1, bias=False) + self.bn2 = norm_layer(med_planes) + self.act2 = act_layer(inplace=True) + + self.conv3 = nn.Conv2d(med_planes, inplanes, kernel_size=1, stride=1, padding=0, bias=False) + self.bn3 = norm_layer(inplanes) + self.act3 = act_layer(inplace=True) + + self.drop_block = drop_block + self.drop_path = drop_path + + def zero_init_last_bn(self): + nn.init.zeros_(self.bn3.weight) + + def forward(self, x): + residual = x + + x = self.conv1(x) + x = self.bn1(x) + if self.drop_block is not None: + x = self.drop_block(x) + x = self.act1(x) + + x = self.conv2(x) + x = self.bn2(x) + if self.drop_block is not None: + x = self.drop_block(x) + x = self.act2(x) + + x = self.conv3(x) + x = self.bn3(x) + if self.drop_block is not None: + x = self.drop_block(x) + + if self.drop_path is not None: + x = self.drop_path(x) + + x += residual + x = self.act3(x) + + return x + + +class ConvTransBlock(nn.Module): + """ + Basic module for ConvTransformer, keep feature maps for CNN block and patch embeddings for transformer encoder block + """ + + def __init__(self, inplanes, outplanes, res_conv, stride, dw_stride, embed_dim, num_heads=12, mlp_ratio=4., + qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., + last_fusion=False, num_med_block=0, groups=1): + + super(ConvTransBlock, self).__init__() + expansion = 4 + # ConvBlock + self.cnn_block = ConvBlock(inplanes=inplanes, outplanes=outplanes, 
class Conformer(nn.Module):
    """
    Two-branch classifier: a convolutional branch and a transformer branch run in
    parallel through a stack of ConvTransBlock stages, and their logits are fused.

    The stages are registered as ``conv_trans_2`` .. ``conv_trans_{fin_stage - 1}``
    and are split into three groups of ``depth // 3`` blocks; the first block of the
    second and third group downsamples the conv branch (stride 2).

    Args:
        patch_size: patch size seen from the input image; the token embedding conv
            runs after the 4x-downsampling stem, so it uses ``patch_size // 4``.
        in_chans: number of input image channels.
        num_classes: number of output classes.
        base_channel, channel_ratio: conv branch width is
            ``base_channel * channel_ratio`` and doubles at each of the two later groups.
        num_med_block: number of extra Med_ConvBlock modules per ConvTransBlock.
        embed_dim, depth, num_heads, mlp_ratio, qkv_bias, qk_scale: transformer
            branch configuration; ``depth`` must be divisible by 3.
        drop_rate, attn_drop_rate, drop_path_rate: dropout / stochastic depth rates.

    forward returns the fused logits produced by ``cls_head`` from the concatenated
    conv-branch and transformer-branch logits.
    """

    def __init__(self, patch_size=16, in_chans=3, num_classes=1000, base_channel=64, channel_ratio=4, num_med_block=0,
                 embed_dim=768, depth=12, num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None,
                 drop_rate=0., attn_drop_rate=0., drop_path_rate=0.):

        # Transformer
        super().__init__()
        self.num_classes = num_classes
        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
        assert depth % 3 == 0

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.trans_dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule

        # Channel widths of the three conv stage groups.
        stage_1_channel = int(base_channel * channel_ratio)
        stage_2_channel = int(base_channel * channel_ratio * 2)
        stage_3_channel = int(base_channel * channel_ratio * 2 * 2)

        # Classifier head
        self.trans_norm = nn.LayerNorm(embed_dim)
        self.trans_cls_head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()
        self.pooling = nn.AdaptiveAvgPool2d(1)
        # The conv head consumes the pooled output of the last stage group, so its
        # width must follow stage_3_channel (this equals 256 * channel_ratio only
        # when base_channel == 64, which is what was hard-coded before).
        self.conv_cls_head = nn.Linear(stage_3_channel, num_classes)
        # Fuses the two per-branch logit vectors into the final prediction.
        self.cls_head = nn.Linear(int(2 * num_classes), num_classes)

        # Stem stage: get the feature maps by conv block (copied from resnet.py)
        self.conv1 = nn.Conv2d(in_chans, 64, kernel_size=7, stride=2, padding=3, bias=False)  # 1 / 2 [112, 112]
        self.bn1 = nn.BatchNorm2d(64)
        self.act1 = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)  # 1 / 4 [56, 56]

        # 1 stage
        trans_dw_stride = patch_size // 4
        self.conv_1 = ConvBlock(inplanes=64, outplanes=stage_1_channel, res_conv=True, stride=1)
        # embedding
        self.trans_patch_conv = nn.Conv2d(64, embed_dim, kernel_size=trans_dw_stride, stride=trans_dw_stride,
                                          padding=0)
        self.trans_1 = Block(dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias,
                             qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate,
                             drop_path=self.trans_dpr[0])

        # Keyword arguments shared by every ConvTransBlock.
        trans_kwargs = dict(embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias,
                            qk_scale=qk_scale, drop_rate=drop_rate, attn_drop_rate=attn_drop_rate,
                            num_med_block=num_med_block)

        # first group of stages (2~4 with depth=12)
        init_stage = 2
        fin_stage = depth // 3 + 1
        for i in range(init_stage, fin_stage):
            self.add_module('conv_trans_' + str(i),
                            ConvTransBlock(stage_1_channel, stage_1_channel, False, 1,
                                           dw_stride=trans_dw_stride,
                                           drop_path_rate=self.trans_dpr[i - 1],
                                           **trans_kwargs))

        # second group of stages (5~8 with depth=12); its first block downsamples
        init_stage = fin_stage
        fin_stage = fin_stage + depth // 3
        for i in range(init_stage, fin_stage):
            first = i == init_stage
            self.add_module('conv_trans_' + str(i),
                            ConvTransBlock(stage_1_channel if first else stage_2_channel, stage_2_channel,
                                           first, 2 if first else 1,
                                           dw_stride=trans_dw_stride // 2,
                                           drop_path_rate=self.trans_dpr[i - 1],
                                           **trans_kwargs))

        # third group of stages (9~12 with depth=12); its first block downsamples
        init_stage = fin_stage
        fin_stage = fin_stage + depth // 3
        for i in range(init_stage, fin_stage):
            first = i == init_stage
            self.add_module('conv_trans_' + str(i),
                            ConvTransBlock(stage_2_channel if first else stage_3_channel, stage_3_channel,
                                           first, 2 if first else 1,
                                           dw_stride=trans_dw_stride // 4,
                                           drop_path_rate=self.trans_dpr[i - 1],
                                           last_fusion=(i == depth),
                                           **trans_kwargs))
        self.fin_stage = fin_stage

        trunc_normal_(self.cls_token, std=.02)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Per-module initialiser applied through ``self.apply``."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
        elif isinstance(m, nn.BatchNorm2d):
            nn.init.constant_(m.weight, 1.)
            nn.init.constant_(m.bias, 0.)
        elif isinstance(m, nn.GroupNorm):
            nn.init.constant_(m.weight, 1.)
            nn.init.constant_(m.bias, 0.)

    @torch.jit.ignore
    def no_weight_decay(self):
        """Names of parameters that should be excluded from weight decay."""
        return {'cls_token'}

    def forward(self, x):
        B = x.shape[0]
        cls_tokens = self.cls_token.expand(B, -1, -1)

        # stem stage [N, 3, 224, 224] -> [N, 64, 56, 56]
        x_base = self.maxpool(self.act1(self.bn1(self.conv1(x))))

        # 1 stage
        x = self.conv_1(x_base, return_x_2=False)
        # embedding: [N, 64, 56, 56] -> [N, d, p, p] -> [N, d, p^2] -> [N, p^2, d] -> [N, p^2 + 1, d]
        x_t = self.trans_patch_conv(x_base).flatten(2).transpose(1, 2)
        x_t = torch.cat([cls_tokens, x_t], dim=1)
        x_t = self.trans_1(x_t)

        # 2 ~ final: look the registered stage up by name (no eval needed)
        for i in range(2, self.fin_stage):
            x, x_t = getattr(self, 'conv_trans_' + str(i))(x, x_t)

        # conv classification
        x_p = self.pooling(x).flatten(1)
        conv_cls = self.conv_cls_head(x_p)

        # trans classification (class token only)
        x_t = self.trans_norm(x_t)
        tran_cls = self.trans_cls_head(x_t[:, 0])

        # add a class vote: fuse the logits of both branches with a linear layer
        cls = torch.cat([conv_cls, tran_cls], dim=1)
        cls = self.cls_head(cls)
        return cls
class DynamicPosBias(nn.Module):
    """Small MLP that maps 2-component position offsets to one bias value per head.

    The hidden width is ``dim // 4``. With ``residual=True`` the two hidden layers
    are applied as residual updates; otherwise the layers are simply chained.
    """

    def __init__(self, dim, num_heads, residual):
        super().__init__()
        self.residual = residual
        self.num_heads = num_heads
        self.pos_dim = dim // 4

        def norm_relu_linear(out_features):
            # LayerNorm -> ReLU -> Linear, the pattern shared by all three layers
            return nn.Sequential(
                nn.LayerNorm(self.pos_dim),
                nn.ReLU(inplace=True),
                nn.Linear(self.pos_dim, out_features),
            )

        self.pos_proj = nn.Linear(2, self.pos_dim)
        self.pos1 = norm_relu_linear(self.pos_dim)
        self.pos2 = norm_relu_linear(self.pos_dim)
        self.pos3 = norm_relu_linear(self.num_heads)

    def forward(self, biases):
        hidden = self.pos_proj(biases)  # 2Wh-1 * 2Ww-1, heads
        if not self.residual:
            return self.pos3(self.pos2(self.pos1(hidden)))
        hidden = hidden + self.pos1(hidden)
        hidden = hidden + self.pos2(hidden)
        return self.pos3(hidden)

    def flops(self, N):
        """Multiply count of the four linear layers for N input positions."""
        return sum((
            N * 2 * self.pos_dim,              # pos_proj
            N * self.pos_dim * self.pos_dim,   # pos1
            N * self.pos_dim * self.pos_dim,   # pos2
            N * self.pos_dim * self.num_heads  # pos3
        ))
Default: 0.0 + """ + + def __init__(self, dim, group_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0., + position_bias=True): + + super().__init__() + self.dim = dim + self.group_size = group_size # Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim ** -0.5 + self.position_bias = position_bias + + if position_bias: + self.pos = DynamicPosBias(self.dim // 4, self.num_heads, residual=False) + + # generate mother-set + position_bias_h = torch.arange(1 - self.group_size[0], self.group_size[0]) + position_bias_w = torch.arange(1 - self.group_size[1], self.group_size[1]) + biases = torch.stack(torch.meshgrid([position_bias_h, position_bias_w])) # 2, 2Wh-1, 2W2-1 + biases = biases.flatten(1).transpose(0, 1).float() + self.register_buffer("biases", biases) + + # get pair-wise relative position index for each token inside the group + coords_h = torch.arange(self.group_size[0]) + coords_w = torch.arange(self.group_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += self.group_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.group_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.group_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer("relative_position_index", relative_position_index) + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + self.softmax = nn.Softmax(dim=-1) + + def forward(self, x, mask=None): + """ + Args: + x: input features with shape of (num_groups*B, N, C) + mask: (0/-inf) mask with shape of 
(num_groups, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + if self.position_bias: + pos = self.pos(self.biases) # 2Wh-1 * 2Ww-1, heads + # select position bias + relative_position_bias = pos[self.relative_position_index.view(-1)].view( + self.group_size[0] * self.group_size[1], self.group_size[0] * self.group_size[1], -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B_, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + def extra_repr(self) -> str: + return f'dim={self.dim}, group_size={self.group_size}, num_heads={self.num_heads}' + + def flops(self, N): + # calculate flops for 1 group with token length of N + flops = 0 + # qkv = self.qkv(x) + flops += N * self.dim * 3 * self.dim + # attn = (q @ k.transpose(-2, -1)) + flops += self.num_heads * N * (self.dim // self.num_heads) * N + # x = (attn @ v) + flops += self.num_heads * N * N * (self.dim // self.num_heads) + # x = self.proj(x) + flops += N * self.dim * self.dim + if self.position_bias: + flops += self.pos.flops(N) + return flops + + +class CrossFormerBlock(nn.Module): + r""" CrossFormer Block. + + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resulotion. + num_heads (int): Number of attention heads. + group_size (int): Group size. 
+ lsda_flag (int): use SDA or LDA, 0 for SDA and 1 for LDA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Module, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, dim, input_resolution, num_heads, group_size=7, lsda_flag=0, + mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0., + act_layer=nn.GELU, norm_layer=nn.LayerNorm, num_patch_size=1): + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.num_heads = num_heads + self.group_size = group_size + self.lsda_flag = lsda_flag + self.mlp_ratio = mlp_ratio + self.num_patch_size = num_patch_size + if min(self.input_resolution) <= self.group_size: + # if group size is larger than input resolution, we don't partition groups + self.lsda_flag = 0 + self.group_size = min(self.input_resolution) + + self.norm1 = norm_layer(dim) + + self.attn = Attention( + dim, group_size=to_2tuple(self.group_size), num_heads=num_heads, + qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop, + position_bias=True) + + self.drop_path = DropPath(drop_path) if drop_path > 0. 
class PatchMerging(nn.Module):
    r""" Patch Merging Layer.

    Normalises the tokens, reshapes them to a (B, C, H, W) map and applies one
    stride-2 convolution per entry of ``patch_size``; the branch outputs are
    concatenated along the channel axis.

    Args:
        input_resolution (tuple[int]): Resolution of input feature.
        dim (int): Number of input channels.
        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
        patch_size (sequence[int], optional): Kernel size of each stride-2 branch. Default: (2,)
        num_input_patch_size (int, optional): Accepted for interface compatibility; not used here.
    """

    def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm, patch_size=(2,), num_input_patch_size=1):
        super().__init__()
        self.input_resolution = input_resolution
        self.dim = dim
        self.reductions = nn.ModuleList()
        # Keep a private copy: the default is immutable and a caller's list is
        # never shared between instances.
        self.patch_size = list(patch_size)
        self.norm = norm_layer(dim)

        stride = 2
        for i, ps in enumerate(self.patch_size):
            self.reductions.append(nn.Conv2d(dim, self._branch_dim(i), kernel_size=ps,
                                             stride=stride, padding=(ps - stride) // 2))

    def _branch_dim(self, i):
        """Output channels of branch ``i``; the last branch keeps the width of the one before it."""
        shift = i if i == len(self.patch_size) - 1 else i + 1
        return 2 * self.dim // 2 ** shift

    def forward(self, x):
        """
        x: B, H*W, C
        """
        H, W = self.input_resolution
        B, L, C = x.shape
        assert L == H * W, "input feature has wrong size"
        assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) are not even."

        x = self.norm(x)
        x = x.view(B, H, W, C).permute(0, 3, 1, 2)

        # each branch: (B, C_i, H/2, W/2) -> (B, H/2 * W/2, C_i)
        xs = [reduction(x).flatten(2).transpose(1, 2) for reduction in self.reductions]
        return torch.cat(xs, dim=2)

    def extra_repr(self) -> str:
        return f"input_resolution={self.input_resolution}, dim={self.dim}"

    def flops(self):
        H, W = self.input_resolution
        flops = H * W * self.dim  # norm
        for i, ps in enumerate(self.patch_size):
            flops += (H // 2) * (W // 2) * ps * ps * self._branch_dim(i) * self.dim
        return flops
Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. + """ + + def __init__(self, dim, input_resolution, depth, num_heads, group_size, + mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False, + patch_size_end=[4], num_patch_size=None): + + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.ModuleList() + for i in range(depth): + lsda_flag = 0 if (i % 2 == 0) else 1 + self.blocks.append(CrossFormerBlock(dim=dim, input_resolution=input_resolution, + num_heads=num_heads, group_size=group_size, + lsda_flag=lsda_flag, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop, attn_drop=attn_drop, + drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer, + num_patch_size=num_patch_size)) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer, + patch_size=patch_size_end, num_input_patch_size=num_patch_size) + else: + self.downsample = None + + def forward(self, x): + for blk in self.blocks: + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x) + else: + x = blk(x) + if self.downsample is not None: + x = self.downsample(x) + return x + + def extra_repr(self) -> str: + return f"dim={self.dim}, 
class PatchEmbed(nn.Module):
    r""" Image to Patch Embedding

    One convolution per entry of ``patch_size`` is applied to the image, all with
    stride ``patch_size[0]``; the resulting token sequences are concatenated along
    the channel axis.

    Args:
        img_size (int | tuple[int]): Image size.  Default: 224.
        patch_size (sequence[int]): Kernel size of each embedding branch; the first
            entry is also the stride.  Default: (4,).
        in_chans (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        norm_layer (nn.Module, optional): Normalization layer. Default: None
    """

    def __init__(self, img_size=224, patch_size=(4,), in_chans=3, embed_dim=96, norm_layer=None):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = list(patch_size)  # private copy; never share the caller's (or a default) list
        base = patch_size[0]
        # Height and width are taken from their own axis, so non-square inputs
        # get the correct token grid.
        patches_resolution = [img_size[0] // base, img_size[1] // base]
        self.img_size = img_size
        self.patch_size = patch_size
        self.patches_resolution = patches_resolution
        self.num_patches = patches_resolution[0] * patches_resolution[1]

        self.in_chans = in_chans
        self.embed_dim = embed_dim

        self.projs = nn.ModuleList()
        for i, ps in enumerate(patch_size):
            self.projs.append(nn.Conv2d(in_chans, self._branch_dim(i), kernel_size=ps,
                                        stride=base, padding=(ps - base) // 2))
        self.norm = norm_layer(embed_dim) if norm_layer is not None else None

    def _branch_dim(self, i):
        """Output channels of branch ``i``; the last branch keeps the width of the one before it."""
        shift = i if i == len(self.patch_size) - 1 else i + 1
        return self.embed_dim // 2 ** shift

    def forward(self, x):
        B, C, H, W = x.shape
        # FIXME look at relaxing size constraints
        assert H == self.img_size[0] and W == self.img_size[1], \
            f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
        xs = [proj(x).flatten(2).transpose(1, 2) for proj in self.projs]  # each: B Ph*Pw C_i
        x = torch.cat(xs, dim=2)
        if self.norm is not None:
            x = self.norm(x)
        return x

    def flops(self):
        Ho, Wo = self.patches_resolution
        flops = 0
        for i, ps in enumerate(self.patch_size):
            flops += Ho * Wo * self._branch_dim(i) * self.in_chans * (ps * ps)
        if self.norm is not None:
            flops += Ho * Wo * self.embed_dim
        return flops
Default: False + """ + + def __init__(self, img_size=224, patch_size=[4], in_chans=3, num_classes=1000, + embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], + group_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None, + drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1, + norm_layer=nn.LayerNorm, ape=False, patch_norm=True, + use_checkpoint=False, merge_size=[[2], [2], [2]], **kwargs): + super().__init__() + + self.num_classes = num_classes + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.ape = ape + self.patch_norm = patch_norm + self.num_features = int(embed_dim * 2 ** (self.num_layers - 1)) + self.mlp_ratio = mlp_ratio + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None) + num_patches = self.patch_embed.num_patches + patches_resolution = self.patch_embed.patches_resolution + self.patches_resolution = patches_resolution + + # absolute position embedding + if self.ape: + self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim)) + trunc_normal_(self.absolute_pos_embed, std=.02) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule + + # build layers + self.layers = nn.ModuleList() + + num_patch_sizes = [len(patch_size)] + [len(m) for m in merge_size] + for i_layer in range(self.num_layers): + patch_size_end = merge_size[i_layer] if i_layer < self.num_layers - 1 else None + num_patch_size = num_patch_sizes[i_layer] + layer = Stage(dim=int(embed_dim * 2 ** i_layer), + input_resolution=(patches_resolution[0] // (2 ** i_layer), + patches_resolution[1] // (2 ** i_layer)), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + group_size=group_size[i_layer], + mlp_ratio=self.mlp_ratio, + qkv_bias=qkv_bias, qk_scale=qk_scale, + 
drop=drop_rate, attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, + use_checkpoint=use_checkpoint, + patch_size_end=patch_size_end, + num_patch_size=num_patch_size) + self.layers.append(layer) + + self.norm = norm_layer(self.num_features) + self.avgpool = nn.AdaptiveAvgPool1d(1) + self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + return {'absolute_pos_embed'} + + @torch.jit.ignore + def no_weight_decay_keywords(self): + return {'relative_position_bias_table'} + + def forward_features(self, x): + x = self.patch_embed(x) + if self.ape: + x = x + self.absolute_pos_embed + x = self.pos_drop(x) + + for layer in self.layers: + x = layer(x) + + x = self.norm(x) # B L C + x = self.avgpool(x.transpose(1, 2)) # B C 1 + x = torch.flatten(x, 1) + return x + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + + def flops(self): + flops = 0 + flops += self.patch_embed.flops() + for i, layer in enumerate(self.layers): + flops += layer.flops() + flops += self.num_features * self.patches_resolution[0] * self.patches_resolution[1] // (2 ** self.num_layers) + flops += self.num_features * self.num_classes + return flops + + +class cross_former_cls_head_warp(nn.Module): + def __init__(self, backbone, num_classes): + super().__init__() + embed_dim = 96 + depths = [2, 2, 18, 2] + num_layers = len(depths) + num_features = int(embed_dim * 2 ** (num_layers - 1)) + self.backbone = backbone + self.head = 
nn.Linear(num_features, num_classes) + + def forward(self, x): + x = self.backbone(x) + x = self.head(x) + return x \ No newline at end of file diff --git a/PuzzleTuning/Backbone/counterpart_models/crossformer_backbone.py b/PuzzleTuning/Backbone/counterpart_models/crossformer_backbone.py new file mode 100644 index 0000000000000000000000000000000000000000..a05ec04199153559410ed407504aa4259d9ea59c --- /dev/null +++ b/PuzzleTuning/Backbone/counterpart_models/crossformer_backbone.py @@ -0,0 +1,659 @@ +import math +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint +from timm.models.layers import DropPath, to_2tuple, trunc_normal_ +from mmdet.utils import get_root_logger +from mmcv.runner import load_checkpoint + +NEG_INF = -1000000 + +class Mlp(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class DynamicPosBias(nn.Module): + def __init__(self, dim, num_heads, residual): + super().__init__() + self.residual = residual + self.num_heads = num_heads + self.pos_dim = dim // 4 + self.pos_proj = nn.Linear(2, self.pos_dim) + self.pos1 = nn.Sequential( + nn.LayerNorm(self.pos_dim), + nn.ReLU(inplace=True), + nn.Linear(self.pos_dim, self.pos_dim), + ) + self.pos2 = nn.Sequential( + nn.LayerNorm(self.pos_dim), + nn.ReLU(inplace=True), + nn.Linear(self.pos_dim, self.pos_dim) + ) + self.pos3 = nn.Sequential( + nn.LayerNorm(self.pos_dim), + nn.ReLU(inplace=True), + nn.Linear(self.pos_dim, self.num_heads) + ) + def forward(self, biases): + if 
class Attention(nn.Module):
    r"""Multi-head self-attention over one group of tokens, with an optional
    dynamically generated relative position bias.

    Args:
        dim (int): Number of input channels.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
        position_bias (bool, optional): If True, add a per-head bias obtained by feeding
            the relative offsets of the group to ``DynamicPosBias``. Default: True
    """

    def __init__(self, dim, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.,
                 position_bias=True):

        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        # a falsy qk_scale (None or 0) falls back to head_dim ** -0.5
        self.scale = qk_scale or (dim // num_heads) ** -0.5
        self.position_bias = position_bias
        if position_bias:
            self.pos = DynamicPosBias(dim // 4, num_heads, residual=False)

        self.qkv = nn.Linear(dim, 3 * dim, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        self.softmax = nn.Softmax(dim=-1)

    def _relative_position_bias(self, gh, gw, device):
        """Build the (num_heads, gh*gw, gh*gw) bias for a gh x gw group."""
        # every relative offset (dh, dw) that can occur inside the group: (2gh-1)*(2gw-1), 2
        offsets_h = torch.arange(1 - gh, gh, device=device)
        offsets_w = torch.arange(1 - gw, gw, device=device)
        offsets = torch.stack(torch.meshgrid([offsets_h, offsets_w]))
        offsets = offsets.flatten(1).transpose(0, 1).contiguous().float()

        # pairwise offsets between the tokens of the group: 2, gh*gw, gh*gw
        rows = torch.arange(gh, device=device)
        cols = torch.arange(gw, device=device)
        grid = torch.flatten(torch.stack(torch.meshgrid([rows, cols])), 1)
        delta = grid[:, :, None] - grid[:, None, :]
        # shift both offsets to start from 0 and fold them into a row index of the offset table
        lookup = (delta[0] + (gh - 1)) * (2 * gw - 1) + (delta[1] + (gw - 1))

        table = self.pos(offsets)  # (2gh-1)*(2gw-1), heads
        bias = table[lookup.reshape(-1)].view(gh * gw, gh * gw, -1)
        return bias.permute(2, 0, 1).contiguous()

    def forward(self, x, H, W, mask=None):
        """
        Args:
            x: input features with shape of (num_windows*B, N, C), with N == H*W
            H, W: height and width of one group of tokens
            mask: (0/-inf) mask with shape of (num_windows, Gh*Gw, Gh*Gw) or None
        """
        batch, tokens, channels = x.shape
        assert H*W == tokens
        heads = self.num_heads

        packed = self.qkv(x).reshape(batch, tokens, 3, heads, channels // heads)
        packed = packed.permute(2, 0, 3, 1, 4).contiguous()
        query, key, value = packed[0], packed[1], packed[2]

        scores = (query * self.scale) @ key.transpose(-2, -1)  # (num_windows*B, heads, N, N)

        if self.position_bias:
            scores = scores + self._relative_position_bias(H, W, scores.device).unsqueeze(0)

        if mask is not None:
            groups = mask.shape[0]
            # broadcast the per-group mask over batch and heads
            scores = scores.view(batch // groups, groups, heads, tokens, tokens) \
                + mask.unsqueeze(1).unsqueeze(0)
            scores = scores.view(-1, heads, tokens, tokens)

        weights = self.attn_drop(self.softmax(scores))

        out = (weights @ value).transpose(1, 2).reshape(batch, tokens, channels)
        return self.proj_drop(self.proj(out))

    def extra_repr(self) -> str:
        return f'dim={self.dim}, num_heads={self.num_heads}'

    def flops(self, N):
        """Return (flops, excluded_flops) for one group with N tokens."""
        head_dim = self.dim // self.num_heads
        qkv_cost = N * self.dim * 3 * self.dim            # qkv = self.qkv(x)
        score_cost = self.num_heads * N * head_dim * N    # q @ k.transpose(-2, -1)
        mix_cost = self.num_heads * N * N * head_dim      # attn @ v
        proj_cost = N * self.dim * self.dim               # self.proj(x)

        flops = qkv_cost + score_cost + mix_cost + proj_cost
        if self.position_bias:
            flops += self.pos.flops(N)
        return flops, score_cost + mix_cost
def forward(self, x, H, W):
    """Apply one CrossFormer block (grouped attention followed by the MLP).

    Args:
        x: token tensor of shape (B, H*W, C).
        H, W: spatial size of the token grid for this call.

    Returns:
        Tensor of shape (B, H*W, C).

    The configured ``self.group_size`` / ``self.lsda_flag`` are never
    modified here: when the input is not larger than the group size, the
    fallback (a single short-distance group of size ``min(H, W)``) applies
    to this call only, so a small input cannot change how later, larger
    inputs are processed.
    """
    B, L, C = x.shape
    assert L == H * W, "input feature has wrong size %d, %d, %d" % (L, H, W)

    # per-call copies; the module attributes stay as configured
    group_size, lsda_flag = self.group_size, self.lsda_flag
    if min(H, W) <= group_size:
        # if window size is larger than input resolution, we don't partition windows
        lsda_flag = 0
        group_size = min(H, W)

    shortcut = x
    x = self.norm1(x)
    x = x.view(B, H, W, C)

    # pad bottom/right so that H and W are multiples of the group size (SDA) or interval (LDA)
    size_div = self.interval if lsda_flag == 1 else group_size
    pad_l = pad_t = 0
    pad_r = (size_div - W % size_div) % size_div
    pad_b = (size_div - H % size_div) % size_div
    x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
    _, Hp, Wp, _ = x.shape
    padded = pad_r > 0 or pad_b > 0

    # -1 marks padded positions, 0 marks real tokens
    mask = torch.zeros((1, Hp, Wp, 1), device=x.device)
    if pad_b > 0:
        mask[:, -pad_b:, :, :] = -1
    if pad_r > 0:
        mask[:, :, -pad_r:, :] = -1

    # group embeddings and generate attn_mask
    if lsda_flag == 0:  # SDA: each group is a G x G block of adjacent tokens
        G = Gh = Gw = group_size
        x = x.reshape(B, Hp // G, G, Wp // G, G, C).permute(0, 1, 3, 2, 4, 5).contiguous()
        x = x.reshape(B * Hp * Wp // G ** 2, G ** 2, C)
        nG = Hp * Wp // G ** 2
        if padded:
            mask = mask.reshape(1, Hp // G, G, Wp // G, G, 1).permute(0, 1, 3, 2, 4, 5).contiguous()
            mask = mask.reshape(nG, 1, G * G)
    else:  # LDA: each group gathers tokens that are `interval` apart
        I, Gh, Gw = self.interval, Hp // self.interval, Wp // self.interval
        x = x.reshape(B, Gh, I, Gw, I, C).permute(0, 2, 4, 1, 3, 5).contiguous()
        x = x.reshape(B * I * I, Gh * Gw, C)
        nG = I ** 2
        if padded:
            mask = mask.reshape(1, Gh, I, Gw, I, 1).permute(0, 2, 4, 1, 3, 5).contiguous()
            mask = mask.reshape(nG, 1, Gh * Gw)

    if padded:
        # padded tokens are masked out as attention keys
        attn_mask = torch.zeros((nG, Gh * Gw, Gh * Gw), device=x.device)
        attn_mask = attn_mask.masked_fill(mask < 0, NEG_INF)
    else:
        attn_mask = None

    # multi-head self-attention
    x = self.attn(x, Gh, Gw, mask=attn_mask)  # nG*B, Gh*Gw, C

    # ungroup embeddings (inverse of the permutation used for grouping)
    if lsda_flag == 0:
        x = x.reshape(B, Hp // G, Wp // G, G, G, C).permute(0, 1, 3, 2, 4, 5).contiguous()  # B, Hp//G, G, Wp//G, G, C
    else:
        x = x.reshape(B, I, I, Gh, Gw, C).permute(0, 3, 1, 4, 2, 5).contiguous()  # B, Gh, I, Gw, I, C
    x = x.reshape(B, Hp, Wp, C)

    # remove padding
    if padded:
        x = x[:, :H, :W, :].contiguous()
    x = x.view(B, H * W, C)

    # residual connections around attention and FFN
    x = shortcut + self.drop_path(x)
    x = x + self.drop_path(self.mlp(self.norm2(x)))

    return x
class PatchMerging(nn.Module):
    r"""Patch merging layer: normalises the tokens, then halves the spatial
    resolution with one strided convolution per kernel size and concatenates
    the branch outputs along the channel dimension.

    Args:
        input_resolution (tuple[int]): Resolution of input feature.
        dim (int): Number of input channels.
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
        patch_size (list[int], optional): Kernel size of each convolution branch. Default: [2]
        num_input_patch_size (int, optional): Accepted but not used by this layer. Default: 1
    """

    def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm, patch_size=[2], num_input_patch_size=1):
        super().__init__()
        self.input_resolution = input_resolution
        self.dim = dim
        self.reductions = nn.ModuleList()
        self.patch_size = patch_size
        self.norm = norm_layer(dim)

        stride = 2
        for branch, kernel in enumerate(patch_size):
            self.reductions.append(
                nn.Conv2d(dim, self._branch_channels(branch), kernel_size=kernel,
                          stride=stride, padding=(kernel - stride) // 2))

    def _branch_channels(self, branch):
        """Output channels of one branch: each branch gets half of the previous one,
        except the last, which gets the same width as the branch before it."""
        is_last = branch == len(self.patch_size) - 1
        exponent = branch if is_last else branch + 1
        return 2 * self.dim // 2 ** exponent

    def forward(self, x, H, W):
        """
        x: B, H*W, C
        """
        batch, length, channels = x.shape
        assert length == H * W, "input feature has wrong size"
        assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) are not even."

        # tokens -> image layout (B, C, H, W) for the convolutions
        grid = self.norm(x).view(batch, H, W, channels).permute(0, 3, 1, 2).contiguous()

        merged = [conv(grid).flatten(2).transpose(1, 2).contiguous() for conv in self.reductions]
        return torch.cat(merged, dim=2)

    def extra_repr(self) -> str:
        return f"input_resolution={self.input_resolution}, dim={self.dim}"

    def flops(self):
        H, W = self.input_resolution
        total = H * W * self.dim  # norm
        for branch, kernel in enumerate(self.patch_size):
            total += (H // 2) * (W // 2) * kernel * kernel * self._branch_channels(branch) * self.dim
        return total
def forward(self, x, H, W):
    """Run every block of the stage, then the optional downsample layer.

    Args:
        x: token tensor of shape (B, H*W, C).
        H, W: spatial size of the token grid.

    Returns:
        (feat, x): ``feat`` is the block output reshaped to (B, C, H, W);
        ``x`` is the output of ``self.downsample`` when one is configured,
        otherwise the block output itself.
    """
    for blk in self.blocks:
        if self.use_checkpoint:
            # every block takes (x, H, W); all three must be forwarded through checkpoint
            x = checkpoint.checkpoint(blk, x, H, W)
        else:
            x = blk(x, H, W)

    B, _, C = x.shape
    feat = x.view(B, H, W, C).permute(0, 3, 1, 2).contiguous()
    if self.downsample is not None:
        x = self.downsample(x, H, W)
    return feat, x
class CrossFormer(nn.Module):
    r""" CrossFormer
        A PyTorch impl of : `CrossFormer: A Versatile Vision Transformer Based on Cross-scale Attention` -

    This variant is a feature backbone: ``forward`` returns the feature map of
    every stage and no classification head is built (``num_classes`` is only stored).

    Args:
        img_size (int | tuple(int)): Input image size. Default 224
        patch_size (list(int)): Kernel sizes of the patch-embedding branches. Default: [4]
        in_chans (int): Number of input image channels. Default: 3
        num_classes (int): Number of classes; stored on the module, not used to build a head. Default: 1000
        embed_dim (int): Patch embedding dimension. Default: 96
        depths (tuple(int)): Depth of each stage.
        num_heads (tuple(int)): Number of attention heads in different layers.
        group_size (int | list(int)): Group size, either one value shared by all stages
            or one value per stage. Default: 7
        crs_interval (int | list(int)): Interval passed to the blocks of each stage, either
            one shared value or one value per stage. Default: [8, 4, 2, 1]
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None
        drop_rate (float): Dropout rate. Default: 0
        attn_drop_rate (float): Attention dropout rate. Default: 0
        drop_path_rate (float): Stochastic depth rate. Default: 0.1
        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
        patch_norm (bool): If True, add normalization after patch embedding. Default: True
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False
        merge_size (list(list(int))): Kernel sizes of the patch-merging layer that follows
            each stage except the last. Default: [[2], [2], [2]]
        **kwargs: Accepted for call-site compatibility (e.g. ``ape``) and ignored.
    """

    def __init__(self, img_size=224, patch_size=[4], in_chans=3, num_classes=1000,
                 embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24],
                 group_size=7, crs_interval=[8, 4, 2, 1], mlp_ratio=4., qkv_bias=True, qk_scale=None,
                 drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1,
                 norm_layer=nn.LayerNorm, patch_norm=True,
                 use_checkpoint=False, merge_size=[[2], [2], [2]], **kwargs):
        super().__init__()

        self.num_classes = num_classes
        self.num_layers = len(depths)
        self.embed_dim = embed_dim
        self.patch_norm = patch_norm
        self.num_features = int(embed_dim * 2 ** (self.num_layers - 1))
        self.mlp_ratio = mlp_ratio

        # The stages are configured per layer below, so a single int (including the
        # default group_size=7) is expanded to one value per stage.
        if isinstance(group_size, int):
            group_size = [group_size] * self.num_layers
        if isinstance(crs_interval, int):
            crs_interval = [crs_interval] * self.num_layers

        # split image into patches
        self.patch_embed = PatchEmbed(
            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
            norm_layer=norm_layer if self.patch_norm else None)
        patches_resolution = self.patch_embed.patches_resolution
        self.patches_resolution = patches_resolution  # [H//4, W//4] of original image size

        self.pos_drop = nn.Dropout(p=drop_rate)

        # stochastic depth decay rule: one rate per block, increasing linearly
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]

        # build layers
        self.layers = nn.ModuleList()

        num_patch_sizes = [len(patch_size)] + [len(m) for m in merge_size]
        for i_layer in range(self.num_layers):
            is_last = i_layer == self.num_layers - 1
            first_block = sum(depths[:i_layer])
            layer = Stage(dim=int(embed_dim * 2 ** i_layer),
                          input_resolution=(patches_resolution[0] // (2 ** i_layer),
                                            patches_resolution[1] // (2 ** i_layer)),
                          depth=depths[i_layer],
                          num_heads=num_heads[i_layer],
                          group_size=group_size[i_layer],
                          interval=crs_interval[i_layer],
                          mlp_ratio=self.mlp_ratio,
                          qkv_bias=qkv_bias, qk_scale=qk_scale,
                          drop=drop_rate, attn_drop=attn_drop_rate,
                          drop_path=dpr[first_block:first_block + depths[i_layer]],
                          norm_layer=norm_layer,
                          # every stage but the last is followed by a patch-merging layer
                          downsample=None if is_last else PatchMerging,
                          use_checkpoint=use_checkpoint,
                          patch_size_end=None if is_last else merge_size[i_layer],
                          num_patch_size=num_patch_sizes[i_layer])
            self.layers.append(layer)

        self.apply(self._init_weights)

    def init_weights(self, pretrained=None):
        """Load weights from the checkpoint path ``pretrained``; does nothing unless it is a str."""
        if isinstance(pretrained, str):
            logger = get_root_logger()
            load_checkpoint(self, pretrained, map_location='cpu', strict=False, logger=logger)

    def _init_weights(self, m):
        """Initialise Linear layers (truncated normal weight, zero bias) and LayerNorm (weight 1, bias 0)."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    @torch.jit.ignore
    def no_weight_decay(self):
        return {'absolute_pos_embed'}

    @torch.jit.ignore
    def no_weight_decay_keywords(self):
        return {'relative_position_bias_table'}

    def forward(self, x):
        """Return the list of per-stage feature maps for the image batch ``x``."""
        x, H, W = self.patch_embed(x)
        x = self.pos_drop(x)

        outs = []
        for i, layer in enumerate(self.layers):
            # the patch embedding reduces the resolution by 4, each earlier stage by another 2
            feat, x = layer(x, H // 4 // (2 ** i), W // 4 // (2 ** i))
            outs.append(feat)

        return outs

    def flops(self):
        """Return (flops, excluded_flops) summed over the patch embedding and all stages."""
        flops = 0
        excluded_flops = 0
        flops += self.patch_embed.flops()
        for layer in self.layers:
            layer_flops, layer_excluded_flops = layer.flops()
            flops += layer_flops
            excluded_flops += layer_excluded_flops
        return flops, excluded_flops
# Models that are created directly through timm.
# Each row is (model_idx prefix, pattern listed with timm.list_models, {edge_size: model}).
#   - an edge-size key of None means "any input size";
#   - an int model value n means "the first n characters of model_idx are the timm name".
# Rows are tried in order, so the more specific prefixes must come first.
_TIMM_MODELS = (
    ('ViT_h', '*vit*', {224: 'vit_huge_patch14_224_in21k'}),
    ('ViT_l', '*vit*', {224: 'vit_large_patch16_224', 384: 'vit_large_patch16_384'}),
    ('ViT_s', '*vit*', {224: 'vit_small_patch16_224', 384: 'vit_small_patch16_384'}),
    ('ViT_t', '*vit*', {224: 'vit_tiny_patch16_224', 384: 'vit_tiny_patch16_384'}),
    # 'ViT_b' and every other 'ViT*' name: vit_base
    ('ViT', '*vit*', {224: 'vit_base_patch16_224', 384: 'vit_base_patch16_384'}),
    ('vgg16_bn', '*vgg*', {None: 'vgg16_bn'}),
    ('vgg16', '*vgg*', {None: 'vgg16'}),
    ('vgg19_bn', '*vgg*', {None: 'vgg19_bn'}),
    ('vgg19', '*vgg*', {None: 'vgg19'}),
    ('deit', '*deit*', {224: 'deit_base_patch16_224', 384: 'deit_base_patch16_384'}),
    ('twins', '*twins*', {None: 'twins_pcpvt_base'}),
    ('pit_b', '*pit*', {224: 'pit_b_224'}),
    ('gcvit', '*gcvit*', {224: 'gcvit_base'}),
    ('xcit_s', '*xcit*', {224: 'xcit_small_12_p16_224_dist', 384: 'xcit_small_12_p16_384_dist'}),
    ('xcit_m', '*xcit*', {224: 'xcit_medium_24_p16_224_dist', 384: 'xcit_medium_24_p16_384_dist'}),
    ('mvitv2', '*mvitv2*', {None: 'mvitv2_small_cls'}),
    ('convit', '*convit*', {224: 'convit_base'}),
    # NOTICE: we find no weight for BoT in timm
    ('bot_256', '*bot*', {256: 'botnet26t_256'}),
    ('densenet', '*densenet*', {None: 'densenet121'}),
    ('xception', '*xception*', {None: 'xception'}),
    ('pvt_v2_b0', '*pvt_v2*', {None: 'pvt_v2_b0'}),
    ('visformer', '*visformer*', {224: 'visformer_small'}),
    ('coat_mini', '*coat*', {224: 'coat_mini'}),
    ('swin_b_384', '*swin*', {384: 'swin_base_patch4_window12_384'}),
    ('swin_b_224', '*swin*', {224: 'swin_base_patch4_window7_224'}),
    ('mobilenetv3', '*mobilenet*', {None: 'mobilenetv3_large_100'}),
    ('mobilevit_s', '*mobilevit*', {None: 'mobilevit_s'}),
    ('inceptionv3', '*inception*', {None: 'inception_v3'}),
    ('crossvit_base', '*crossvit_base*', {None: 'crossvit_base_240'}),
    ('efficientnet_b', '*efficientnet*', {None: 15}),  # e.g. efficientnet_b3
    # ResNet + ViT hybrid model, 384 input
    ('ResN50_ViT_384', '*vit_base_resnet*', {None: 'vit_base_resnet50_384'}),
    ('coat_lite_small', '*coat*', {224: 'coat_lite_small'}),
    ('efficientformer_l', '*efficientformer*', {224: 18}),  # e.g. efficientformer_l1
)


def _resolve_timm_name(model_idx, edge_size):
    """Return (list pattern, timm model name) for model_idx, or None if no family matches.

    The model name is None when the family is known but edge_size is not supported.
    """
    for prefix, pattern, by_size in _TIMM_MODELS:
        if model_idx.startswith(prefix):
            name = by_size.get(edge_size, by_size.get(None))
            if isinstance(name, int):
                name = model_idx[:name]
            return pattern, name
    return None


def _build_resnet(model_idx, num_classes, pretrained):
    """Build a torchvision ResNet with a new classification layer, or None for an unknown depth."""
    for prefix, factory in (('ResNet34', models.resnet34),
                            ('ResNet50', models.resnet50),
                            ('ResNet101', models.resnet101)):
        if model_idx.startswith(prefix):
            model = factory(pretrained=pretrained)
            model.fc = nn.Linear(model.fc.in_features, num_classes)
            return model
    return None


def _build_conformer(num_classes, pretrained):
    """Build Conformer base; with pretrained=True load the local checkpoint and replace the heads."""
    from Backbone.counterpart_models import conformer

    embed_dim = 576
    channel_ratio = 6

    if not pretrained:
        return conformer.Conformer(num_classes=num_classes, patch_size=16, channel_ratio=channel_ratio,
                                   embed_dim=embed_dim, depth=12, num_heads=9, mlp_ratio=4, qkv_bias=True)

    model = conformer.Conformer(num_classes=1000, patch_size=16, channel_ratio=channel_ratio,
                                embed_dim=embed_dim, depth=12, num_heads=9, mlp_ratio=4, qkv_bias=True)
    # fixme model is downloaded at this path (relative to the working directory)
    # downloaded from official model state at https://github.com/pengzhiliang/Conformer
    save_model_path = '../saved_models/Conformer_base_patch16.pth'
    # map_location keeps the load working on machines without the device the file was saved from
    model.load_state_dict(torch.load(save_model_path, map_location='cpu'), False)

    model.trans_cls_head = nn.Linear(embed_dim, num_classes)
    model.conv_cls_head = nn.Linear(int(256 * channel_ratio), num_classes)
    model.cls_head = nn.Linear(int(2 * num_classes), num_classes)
    return model


def _build_crossformer(num_classes, edge_size, pretrained):
    """Build the CrossFormer base backbone and wrap it with a classification head."""
    from Backbone.counterpart_models import crossformer

    backbone = crossformer.CrossFormer(img_size=edge_size,
                                       patch_size=[4, 8, 16, 32],
                                       in_chans=3,
                                       num_classes=0,  # get backbone only
                                       embed_dim=96,
                                       depths=[2, 2, 18, 2],
                                       num_heads=[3, 6, 12, 24],
                                       group_size=[7, 7, 7, 7],
                                       mlp_ratio=4.,
                                       qkv_bias=True,
                                       qk_scale=None,
                                       drop_rate=0.0,
                                       drop_path_rate=0.3,
                                       ape=False,
                                       patch_norm=True,
                                       use_checkpoint=False,
                                       merge_size=[[2, 4], [2, 4], [2, 4]], )
    if pretrained:
        # fixme model is downloaded at this path (relative to the working directory)
        # downloaded from official model state at https://github.com/cheerss/CrossFormer
        save_model_path = '../saved_models/crossformer-b.pth'
        backbone.load_state_dict(torch.load(save_model_path, map_location='cpu')['model'], False)
    return crossformer.cross_former_cls_head_warp(backbone, num_classes)


# get model
def get_model(num_classes=1000, edge_size=224, model_idx=None, drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0,
              pretrained_backbone=True, use_cls_token=True, use_pos_embedding=True, use_att_module='SimAM'):
    """
    Build the model selected by the prefix of ``model_idx`` and smoke-test it with one random image.

    :param num_classes: number of classes of your dataset
    :param edge_size: the input edge size of the dataloader
    :param model_idx: the model we are going to use, in the format Model_size_other_info

    :param drop_rate: the dropout probability of the proposed models
    :param attn_drop_rate: the dropout probability right after the MHSA block or MHGA block
    :param drop_path_rate: the probability of stochastic depth

    :param pretrained_backbone: initialise from the official pretrained weights (True) or randomly (False)

    :param use_cls_token: to use the class token
    :param use_pos_embedding: to use the positional embedding
    :param use_att_module: which attention module to use in the FGD Focus block

    :return: the prepared model, or -1 when the model is not defined, the edge size is not
             supported for it, or the forward smoke test fails
    """
    if not isinstance(model_idx, str):
        print('\nThe model', model_idx, 'with the edge size of', edge_size)
        print("is not defined in the script!!", '\n')
        return -1

    if model_idx.startswith('ResNet'):  # Transfer learning for the ResNets
        model = _build_resnet(model_idx, num_classes, pretrained_backbone)
        if model is None:
            print('this model is not defined in get model')
            return -1

    elif model_idx.startswith('Backbone'):  # ours: MSHT
        # NOTICE: HERE 'pretrained' controls only whether the backbone CNN is initiated randomly
        # or by its official pretrained models
        model = ResHybrid.create_model(model_idx, edge_size, pretrained=pretrained_backbone, num_classes=num_classes,
                                       drop_rate=drop_rate, attn_drop_rate=attn_drop_rate,
                                       drop_path_rate=drop_path_rate, use_cls_token=use_cls_token,
                                       use_pos_embedding=use_pos_embedding, use_att_module=use_att_module)

    elif model_idx.startswith('conformer'):  # Transfer learning for Conformer base
        model = _build_conformer(num_classes, pretrained_backbone)

    elif model_idx.startswith('cross_former') and edge_size == 224:  # Transfer learning for crossformer base
        model = _build_crossformer(num_classes, edge_size, pretrained_backbone)

    else:
        resolved = _resolve_timm_name(model_idx, edge_size)
        if resolved is None:
            print('\nThe model', model_idx, 'with the edge size of', edge_size)
            print("is not defined in the script!!", '\n')
            return -1
        pattern, timm_name = resolved
        if timm_name is None:
            # known family, but no variant for this input size
            print('not a avaliable image size with', model_idx)
            return -1

        import timm
        from pprint import pprint
        pprint(timm.list_models(pattern))
        model = timm.create_model(timm_name, pretrained=pretrained_backbone, num_classes=num_classes)

    # Smoke test: one forward pass on a random image. It runs in eval mode without autograd so
    # that the random input cannot update BatchNorm running statistics of pretrained weights.
    was_training = model.training
    try:
        model.eval()
        with torch.no_grad():
            preds = model(torch.randn(1, 3, edge_size, edge_size))  # (1, class_number)
        print('test model output:', preds)
    except Exception as err:
        print("Problem exist in the model defining process!!")
        print(repr(err))
        return -1
    finally:
        model.train(was_training)

    print('model is ready now!')
    return model
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/PuzzleTuning/Counterpart PreTrain Methods/dino-main/README.md b/PuzzleTuning/Counterpart PreTrain Methods/dino-main/README.md new file mode 100644 index 0000000000000000000000000000000000000000..50c8534217e768b68f9a834a95ca80a3ca2b83fc --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/dino-main/README.md @@ -0,0 +1,14 @@ +# Self-Supervised Vision Transformers with DINO + +The original repo of DINO could be found [here](https://github.com/facebookresearch/dino "DINO") + +Pip requirements: timm == 0.4.9, PyTorch == 1.7.1, Torchvision == 0.8.2, Cuda == 11.0 + +Typical BASH: + ```console +python -m torch.distributed.launch \ +--nproc_per_node=2 main_dino.py --arch vit_base --batch_size_per_gpu 128 \ +--lr 1.5e-4 --epochs 100 --data_path /root/autodl-tmp/All \ +--basic_state_dict /root/autodl-tmp/ViT_b16_224_Imagenet.pth \ +--num_workers 32 --output_dir the/path/of/CPIA + ``` diff --git a/PuzzleTuning/Counterpart PreTrain Methods/dino-main/eval_copy_detection.py b/PuzzleTuning/Counterpart PreTrain Methods/dino-main/eval_copy_detection.py new file mode 100644 index 0000000000000000000000000000000000000000..73dcd507893f204a47a5036cc61bd65b30cf1ead --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/dino-main/eval_copy_detection.py @@ -0,0 +1,301 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +import sys +import pickle +import argparse + +import torch +from torch import nn +import torch.distributed as dist +import torch.backends.cudnn as cudnn +from torchvision import models as torchvision_models +from torchvision import transforms as pth_transforms +from PIL import Image, ImageFile +import numpy as np + +import utils +import vision_transformer as vits +from eval_knn import extract_features + + +class CopydaysDataset(): + def __init__(self, basedir): + self.basedir = basedir + self.block_names = ( + ['original', 'strong'] + + ['jpegqual/%d' % i for i in + [3, 5, 8, 10, 15, 20, 30, 50, 75]] + + ['crops/%d' % i for i in + [10, 15, 20, 30, 40, 50, 60, 70, 80]]) + self.nblocks = len(self.block_names) + + self.query_blocks = range(self.nblocks) + self.q_block_sizes = np.ones(self.nblocks, dtype=int) * 157 + self.q_block_sizes[1] = 229 + # search only among originals + self.database_blocks = [0] + + def get_block(self, i): + dirname = self.basedir + '/' + self.block_names[i] + fnames = [dirname + '/' + fname + for fname in sorted(os.listdir(dirname)) + if fname.endswith('.jpg')] + return fnames + + def get_block_filenames(self, subdir_name): + dirname = self.basedir + '/' + subdir_name + return [fname + for fname in sorted(os.listdir(dirname)) + if fname.endswith('.jpg')] + + def eval_result(self, ids, distances): + j0 = 0 + for i in range(self.nblocks): + j1 = j0 + self.q_block_sizes[i] + block_name = self.block_names[i] + I = ids[j0:j1] # block size + sum_AP = 0 + if block_name != 'strong': + # 1:1 mapping of files to names + positives_per_query = [[i] for i in range(j1 - j0)] + else: + originals = self.get_block_filenames('original') + strongs = self.get_block_filenames('strong') + + # check if prefixes match + positives_per_query = [ + [j for j, bname in enumerate(originals) + if bname[:4] == qname[:4]] + for qname in strongs] + + for qno, Iline in enumerate(I): + positives = positives_per_query[qno] + ranks = [] + for rank, bno in 
enumerate(Iline): + if bno in positives: + ranks.append(rank) + sum_AP += score_ap_from_ranks_1(ranks, len(positives)) + + print("eval on %s mAP=%.3f" % ( + block_name, sum_AP / (j1 - j0))) + j0 = j1 + + +# from the Holidays evaluation package +def score_ap_from_ranks_1(ranks, nres): + """ Compute the average precision of one search. + ranks = ordered list of ranks of true positives + nres = total number of positives in dataset + """ + + # accumulate trapezoids in PR-plot + ap = 0.0 + + # All have an x-size of: + recall_step = 1.0 / nres + + for ntp, rank in enumerate(ranks): + + # y-size on left side of trapezoid: + # ntp = nb of true positives so far + # rank = nb of retrieved items so far + if rank == 0: + precision_0 = 1.0 + else: + precision_0 = ntp / float(rank) + + # y-size on right side of trapezoid: + # ntp and rank are increased by one + precision_1 = (ntp + 1) / float(rank + 1) + + ap += (precision_1 + precision_0) * recall_step / 2.0 + + return ap + + +class ImgListDataset(torch.utils.data.Dataset): + def __init__(self, img_list, transform=None): + self.samples = img_list + self.transform = transform + + def __getitem__(self, i): + with open(self.samples[i], 'rb') as f: + img = Image.open(f) + img = img.convert('RGB') + if self.transform is not None: + img = self.transform(img) + return img, i + + def __len__(self): + return len(self.samples) + + +def is_image_file(s): + ext = s.split(".")[-1] + if ext in ['jpg', 'jpeg', 'png', 'ppm', 'bmp', 'pgm', 'tif', 'tiff', 'webp']: + return True + return False + + +@torch.no_grad() +def extract_features(image_list, model, args): + transform = pth_transforms.Compose([ + pth_transforms.Resize((args.imsize, args.imsize), interpolation=3), + pth_transforms.ToTensor(), + pth_transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ]) + tempdataset = ImgListDataset(image_list, transform=transform) + data_loader = torch.utils.data.DataLoader(tempdataset, batch_size=args.batch_size_per_gpu, + 
num_workers=args.num_workers, drop_last=False, + sampler=torch.utils.data.DistributedSampler(tempdataset, shuffle=False)) + features = None + for samples, index in utils.MetricLogger(delimiter=" ").log_every(data_loader, 10): + samples, index = samples.cuda(non_blocking=True), index.cuda(non_blocking=True) + feats = model.get_intermediate_layers(samples, n=1)[0].clone() + + cls_output_token = feats[:, 0, :] # [CLS] token + # GeM with exponent 4 for output patch tokens + b, h, w, d = len(samples), int(samples.shape[-2] / model.patch_embed.patch_size), int(samples.shape[-1] / model.patch_embed.patch_size), feats.shape[-1] + feats = feats[:, 1:, :].reshape(b, h, w, d) + feats = feats.clamp(min=1e-6).permute(0, 3, 1, 2) + feats = nn.functional.avg_pool2d(feats.pow(4), (h, w)).pow(1. / 4).reshape(b, -1) + # concatenate [CLS] token and GeM pooled patch tokens + feats = torch.cat((cls_output_token, feats), dim=1) + + # init storage feature matrix + if dist.get_rank() == 0 and features is None: + features = torch.zeros(len(data_loader.dataset), feats.shape[-1]) + if args.use_cuda: + features = features.cuda(non_blocking=True) + + # get indexes from all processes + y_all = torch.empty(dist.get_world_size(), index.size(0), dtype=index.dtype, device=index.device) + y_l = list(y_all.unbind(0)) + y_all_reduce = torch.distributed.all_gather(y_l, index, async_op=True) + y_all_reduce.wait() + index_all = torch.cat(y_l) + + # share features between processes + feats_all = torch.empty(dist.get_world_size(), feats.size(0), feats.size(1), + dtype=feats.dtype, device=feats.device) + output_l = list(feats_all.unbind(0)) + output_all_reduce = torch.distributed.all_gather(output_l, feats, async_op=True) + output_all_reduce.wait() + + # update storage feature matrix + if dist.get_rank() == 0: + if args.use_cuda: + features.index_copy_(0, index_all, torch.cat(output_l)) + else: + features.index_copy_(0, index_all.cpu(), torch.cat(output_l).cpu()) + return features # features is still None 
for every rank which is not 0 (main) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('Copy detection on Copydays') + parser.add_argument('--data_path', default='/path/to/copydays/', type=str, + help="See https://lear.inrialpes.fr/~jegou/data.php#copydays") + parser.add_argument('--whitening_path', default='/path/to/whitening_data/', type=str, + help="""Path to directory with images used for computing the whitening operator. + In our paper, we use 20k random images from YFCC100M.""") + parser.add_argument('--distractors_path', default='/path/to/distractors/', type=str, + help="Path to directory with distractors images. In our paper, we use 10k random images from YFCC100M.") + parser.add_argument('--imsize', default=320, type=int, help='Image size (square image)') + parser.add_argument('--batch_size_per_gpu', default=16, type=int, help='Per-GPU batch-size') + parser.add_argument('--pretrained_weights', default='', type=str, help="Path to pretrained weights to evaluate.") + parser.add_argument('--use_cuda', default=True, type=utils.bool_flag) + parser.add_argument('--arch', default='vit_base', type=str, help='Architecture') + parser.add_argument('--patch_size', default=8, type=int, help='Patch resolution of the model.') + parser.add_argument("--checkpoint_key", default="teacher", type=str, + help='Key to use in the checkpoint (example: "teacher")') + parser.add_argument('--num_workers', default=10, type=int, help='Number of data loading workers per GPU.') + parser.add_argument("--dist_url", default="env://", type=str, help="""url used to set up + distributed training; see https://pytorch.org/docs/stable/distributed.html""") + parser.add_argument("--local_rank", default=0, type=int, help="Please ignore and do not set this argument.") + args = parser.parse_args() + + utils.init_distributed_mode(args) + print("git:\n {}\n".format(utils.get_sha())) + print("\n".join("%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items()))) + cudnn.benchmark 
= True + + # ============ building network ... ============ + if "vit" in args.arch: + model = vits.__dict__[args.arch](patch_size=args.patch_size, num_classes=0) + print(f"Model {args.arch} {args.patch_size}x{args.patch_size} built.") + else: + print(f"Architecture {args.arch} non supported") + sys.exit(1) + if args.use_cuda: + model.cuda() + model.eval() + utils.load_pretrained_weights(model, args.pretrained_weights, args.checkpoint_key, args.arch, args.patch_size) + + dataset = CopydaysDataset(args.data_path) + + # ============ Extract features ... ============ + # extract features for queries + queries = [] + for q in dataset.query_blocks: + queries.append(extract_features(dataset.get_block(q), model, args)) + if utils.get_rank() == 0: + queries = torch.cat(queries) + print(f"Extraction of queries features done. Shape: {queries.shape}") + + # extract features for database + database = [] + for b in dataset.database_blocks: + database.append(extract_features(dataset.get_block(b), model, args)) + + # extract features for distractors + if os.path.isdir(args.distractors_path): + print("Using distractors...") + list_distractors = [os.path.join(args.distractors_path, s) for s in os.listdir(args.distractors_path) if is_image_file(s)] + database.append(extract_features(list_distractors, model, args)) + if utils.get_rank() == 0: + database = torch.cat(database) + print(f"Extraction of database and distractors features done. Shape: {database.shape}") + + # ============ Whitening ... 
============ + if os.path.isdir(args.whitening_path): + print(f"Extracting features on images from {args.whitening_path} for learning the whitening operator.") + list_whit = [os.path.join(args.whitening_path, s) for s in os.listdir(args.whitening_path) if is_image_file(s)] + features_for_whitening = extract_features(list_whit, model, args) + if utils.get_rank() == 0: + # center + mean_feature = torch.mean(features_for_whitening, dim=0) + database -= mean_feature + queries -= mean_feature + pca = utils.PCA(dim=database.shape[-1], whit=0.5) + # compute covariance + cov = torch.mm(features_for_whitening.T, features_for_whitening) / features_for_whitening.shape[0] + pca.train_pca(cov.cpu().numpy()) + database = pca.apply(database) + queries = pca.apply(queries) + + # ============ Copy detection ... ============ + if utils.get_rank() == 0: + # l2 normalize the features + database = nn.functional.normalize(database, dim=1, p=2) + queries = nn.functional.normalize(queries, dim=1, p=2) + + # similarity + similarity = torch.mm(queries, database.T) + distances, indices = similarity.topk(20, largest=True, sorted=True) + + # evaluate + retrieved = dataset.eval_result(indices, distances) + dist.barrier() + diff --git a/PuzzleTuning/Counterpart PreTrain Methods/dino-main/eval_image_retrieval.py b/PuzzleTuning/Counterpart PreTrain Methods/dino-main/eval_image_retrieval.py new file mode 100644 index 0000000000000000000000000000000000000000..999f8c9009a9abcc28308c5995c286f65b1522ac --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/dino-main/eval_image_retrieval.py @@ -0,0 +1,201 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import sys +import pickle +import argparse + +import torch +from torch import nn +import torch.distributed as dist +import torch.backends.cudnn as cudnn +from torchvision import models as torchvision_models +from torchvision import transforms as pth_transforms +from PIL import Image, ImageFile +import numpy as np + +import utils +import vision_transformer as vits +from eval_knn import extract_features + + +class OxfordParisDataset(torch.utils.data.Dataset): + def __init__(self, dir_main, dataset, split, transform=None, imsize=None): + if dataset not in ['roxford5k', 'rparis6k']: + raise ValueError('Unknown dataset: {}!'.format(dataset)) + + # loading imlist, qimlist, and gnd, in cfg as a dict + gnd_fname = os.path.join(dir_main, dataset, 'gnd_{}.pkl'.format(dataset)) + with open(gnd_fname, 'rb') as f: + cfg = pickle.load(f) + cfg['gnd_fname'] = gnd_fname + cfg['ext'] = '.jpg' + cfg['qext'] = '.jpg' + cfg['dir_data'] = os.path.join(dir_main, dataset) + cfg['dir_images'] = os.path.join(cfg['dir_data'], 'jpg') + cfg['n'] = len(cfg['imlist']) + cfg['nq'] = len(cfg['qimlist']) + cfg['im_fname'] = config_imname + cfg['qim_fname'] = config_qimname + cfg['dataset'] = dataset + self.cfg = cfg + + self.samples = cfg["qimlist"] if split == "query" else cfg["imlist"] + self.transform = transform + self.imsize = imsize + + def __len__(self): + return len(self.samples) + + def __getitem__(self, index): + path = os.path.join(self.cfg["dir_images"], self.samples[index] + ".jpg") + ImageFile.LOAD_TRUNCATED_IMAGES = True + with open(path, 'rb') as f: + img = 
Image.open(f) + img = img.convert('RGB') + if self.imsize is not None: + img.thumbnail((self.imsize, self.imsize), Image.ANTIALIAS) + if self.transform is not None: + img = self.transform(img) + return img, index + + +def config_imname(cfg, i): + return os.path.join(cfg['dir_images'], cfg['imlist'][i] + cfg['ext']) + + +def config_qimname(cfg, i): + return os.path.join(cfg['dir_images'], cfg['qimlist'][i] + cfg['qext']) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('Image Retrieval on revisited Paris and Oxford') + parser.add_argument('--data_path', default='/path/to/revisited_paris_oxford/', type=str) + parser.add_argument('--dataset', default='roxford5k', type=str, choices=['roxford5k', 'rparis6k']) + parser.add_argument('--multiscale', default=False, type=utils.bool_flag) + parser.add_argument('--imsize', default=224, type=int, help='Image size') + parser.add_argument('--pretrained_weights', default='', type=str, help="Path to pretrained weights to evaluate.") + parser.add_argument('--use_cuda', default=True, type=utils.bool_flag) + parser.add_argument('--arch', default='vit_small', type=str, help='Architecture') + parser.add_argument('--patch_size', default=16, type=int, help='Patch resolution of the model.') + parser.add_argument("--checkpoint_key", default="teacher", type=str, + help='Key to use in the checkpoint (example: "teacher")') + parser.add_argument('--num_workers', default=10, type=int, help='Number of data loading workers per GPU.') + parser.add_argument("--dist_url", default="env://", type=str, help="""url used to set up + distributed training; see https://pytorch.org/docs/stable/distributed.html""") + parser.add_argument("--local_rank", default=0, type=int, help="Please ignore and do not set this argument.") + args = parser.parse_args() + + utils.init_distributed_mode(args) + print("git:\n {}\n".format(utils.get_sha())) + print("\n".join("%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items()))) + cudnn.benchmark = 
True + + # ============ preparing data ... ============ + transform = pth_transforms.Compose([ + pth_transforms.ToTensor(), + pth_transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ]) + dataset_train = OxfordParisDataset(args.data_path, args.dataset, split="train", transform=transform, imsize=args.imsize) + dataset_query = OxfordParisDataset(args.data_path, args.dataset, split="query", transform=transform, imsize=args.imsize) + sampler = torch.utils.data.DistributedSampler(dataset_train, shuffle=False) + data_loader_train = torch.utils.data.DataLoader( + dataset_train, + sampler=sampler, + batch_size=1, + num_workers=args.num_workers, + pin_memory=True, + drop_last=False, + ) + data_loader_query = torch.utils.data.DataLoader( + dataset_query, + batch_size=1, + num_workers=args.num_workers, + pin_memory=True, + drop_last=False, + ) + print(f"train: {len(dataset_train)} imgs / query: {len(dataset_query)} imgs") + + # ============ building network ... ============ + if "vit" in args.arch: + model = vits.__dict__[args.arch](patch_size=args.patch_size, num_classes=0) + print(f"Model {args.arch} {args.patch_size}x{args.patch_size} built.") + elif "xcit" in args.arch: + model = torch.hub.load('facebookresearch/xcit:main', args.arch, num_classes=0) + elif args.arch in torchvision_models.__dict__.keys(): + model = torchvision_models.__dict__[args.arch](num_classes=0) + else: + print(f"Architecture {args.arch} non supported") + sys.exit(1) + if args.use_cuda: + model.cuda() + model.eval() + + # load pretrained weights + if os.path.isfile(args.pretrained_weights): + state_dict = torch.load(args.pretrained_weights, map_location="cpu") + if args.checkpoint_key is not None and args.checkpoint_key in state_dict: + print(f"Take key {args.checkpoint_key} in provided checkpoint dict") + state_dict = state_dict[args.checkpoint_key] + # remove `module.` prefix + state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()} + # remove `backbone.` prefix 
induced by multicrop wrapper + state_dict = {k.replace("backbone.", ""): v for k, v in state_dict.items()} + msg = model.load_state_dict(state_dict, strict=False) + print('Pretrained weights found at {} and loaded with msg: {}'.format(args.pretrained_weights, msg)) + elif args.arch == "vit_small" and args.patch_size == 16: + print("Since no pretrained weights have been provided, we load pretrained DINO weights on Google Landmark v2.") + model.load_state_dict(torch.hub.load_state_dict_from_url(url="https://dl.fbaipublicfiles.com/dino/dino_vitsmall16_googlelandmark_pretrain/dino_vitsmall16_googlelandmark_pretrain.pth")) + else: + print("Warning: We use random weights.") + + ############################################################################ + # Step 1: extract features + train_features = extract_features(model, data_loader_train, args.use_cuda, multiscale=args.multiscale) + query_features = extract_features(model, data_loader_query, args.use_cuda, multiscale=args.multiscale) + + if utils.get_rank() == 0: # only rank 0 will work from now on + # normalize features + train_features = nn.functional.normalize(train_features, dim=1, p=2) + query_features = nn.functional.normalize(query_features, dim=1, p=2) + + ############################################################################ + # Step 2: similarity + sim = torch.mm(train_features, query_features.T) + ranks = torch.argsort(-sim, dim=0).cpu().numpy() + + ############################################################################ + # Step 3: evaluate + gnd = dataset_train.cfg['gnd'] + # evaluate ranks + ks = [1, 5, 10] + # search for easy & hard + gnd_t = [] + for i in range(len(gnd)): + g = {} + g['ok'] = np.concatenate([gnd[i]['easy'], gnd[i]['hard']]) + g['junk'] = np.concatenate([gnd[i]['junk']]) + gnd_t.append(g) + mapM, apsM, mprM, prsM = utils.compute_map(ranks, gnd_t, ks) + # search for hard + gnd_t = [] + for i in range(len(gnd)): + g = {} + g['ok'] = np.concatenate([gnd[i]['hard']]) + g['junk'] 
= np.concatenate([gnd[i]['junk'], gnd[i]['easy']]) + gnd_t.append(g) + mapH, apsH, mprH, prsH = utils.compute_map(ranks, gnd_t, ks) + print('>> {}: mAP M: {}, H: {}'.format(args.dataset, np.around(mapM*100, decimals=2), np.around(mapH*100, decimals=2))) + print('>> {}: mP@k{} M: {}, H: {}'.format(args.dataset, np.array(ks), np.around(mprM*100, decimals=2), np.around(mprH*100, decimals=2))) + dist.barrier() diff --git a/PuzzleTuning/Counterpart PreTrain Methods/dino-main/eval_knn.py b/PuzzleTuning/Counterpart PreTrain Methods/dino-main/eval_knn.py new file mode 100644 index 0000000000000000000000000000000000000000..fe99a26049cda2d764086727223e6cc9a8f2bfb8 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/dino-main/eval_knn.py @@ -0,0 +1,242 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import sys +import argparse + +import torch +from torch import nn +import torch.distributed as dist +import torch.backends.cudnn as cudnn +from torchvision import datasets +from torchvision import transforms as pth_transforms +from torchvision import models as torchvision_models + +import utils +import vision_transformer as vits + + +def extract_feature_pipeline(args): + # ============ preparing data ... 
============ + transform = pth_transforms.Compose([ + pth_transforms.Resize(256, interpolation=3), + pth_transforms.CenterCrop(224), + pth_transforms.ToTensor(), + pth_transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ]) + dataset_train = ReturnIndexDataset(os.path.join(args.data_path, "train"), transform=transform) + dataset_val = ReturnIndexDataset(os.path.join(args.data_path, "val"), transform=transform) + sampler = torch.utils.data.DistributedSampler(dataset_train, shuffle=False) + data_loader_train = torch.utils.data.DataLoader( + dataset_train, + sampler=sampler, + batch_size=args.batch_size_per_gpu, + num_workers=args.num_workers, + pin_memory=True, + drop_last=False, + ) + data_loader_val = torch.utils.data.DataLoader( + dataset_val, + batch_size=args.batch_size_per_gpu, + num_workers=args.num_workers, + pin_memory=True, + drop_last=False, + ) + print(f"Data loaded with {len(dataset_train)} train and {len(dataset_val)} val imgs.") + + # ============ building network ... ============ + if "vit" in args.arch: + model = vits.__dict__[args.arch](patch_size=args.patch_size, num_classes=0) + print(f"Model {args.arch} {args.patch_size}x{args.patch_size} built.") + elif "xcit" in args.arch: + model = torch.hub.load('facebookresearch/xcit:main', args.arch, num_classes=0) + elif args.arch in torchvision_models.__dict__.keys(): + model = torchvision_models.__dict__[args.arch](num_classes=0) + model.fc = nn.Identity() + else: + print(f"Architecture {args.arch} non supported") + sys.exit(1) + model.cuda() + utils.load_pretrained_weights(model, args.pretrained_weights, args.checkpoint_key, args.arch, args.patch_size) + model.eval() + + # ============ extract features ... 
============ + print("Extracting features for train set...") + train_features = extract_features(model, data_loader_train, args.use_cuda) + print("Extracting features for val set...") + test_features = extract_features(model, data_loader_val, args.use_cuda) + + if utils.get_rank() == 0: + train_features = nn.functional.normalize(train_features, dim=1, p=2) + test_features = nn.functional.normalize(test_features, dim=1, p=2) + + train_labels = torch.tensor([s[-1] for s in dataset_train.samples]).long() + test_labels = torch.tensor([s[-1] for s in dataset_val.samples]).long() + # save features and labels + if args.dump_features and dist.get_rank() == 0: + torch.save(train_features.cpu(), os.path.join(args.dump_features, "trainfeat.pth")) + torch.save(test_features.cpu(), os.path.join(args.dump_features, "testfeat.pth")) + torch.save(train_labels.cpu(), os.path.join(args.dump_features, "trainlabels.pth")) + torch.save(test_labels.cpu(), os.path.join(args.dump_features, "testlabels.pth")) + return train_features, test_features, train_labels, test_labels + + +@torch.no_grad() +def extract_features(model, data_loader, use_cuda=True, multiscale=False): + metric_logger = utils.MetricLogger(delimiter=" ") + features = None + for samples, index in metric_logger.log_every(data_loader, 10): + samples = samples.cuda(non_blocking=True) + index = index.cuda(non_blocking=True) + if multiscale: + feats = utils.multi_scale(samples, model) + else: + feats = model(samples).clone() + + # init storage feature matrix + if dist.get_rank() == 0 and features is None: + features = torch.zeros(len(data_loader.dataset), feats.shape[-1]) + if use_cuda: + features = features.cuda(non_blocking=True) + print(f"Storing features into tensor of shape {features.shape}") + + # get indexes from all processes + y_all = torch.empty(dist.get_world_size(), index.size(0), dtype=index.dtype, device=index.device) + y_l = list(y_all.unbind(0)) + y_all_reduce = torch.distributed.all_gather(y_l, index, 
async_op=True) + y_all_reduce.wait() + index_all = torch.cat(y_l) + + # share features between processes + feats_all = torch.empty( + dist.get_world_size(), + feats.size(0), + feats.size(1), + dtype=feats.dtype, + device=feats.device, + ) + output_l = list(feats_all.unbind(0)) + output_all_reduce = torch.distributed.all_gather(output_l, feats, async_op=True) + output_all_reduce.wait() + + # update storage feature matrix + if dist.get_rank() == 0: + if use_cuda: + features.index_copy_(0, index_all, torch.cat(output_l)) + else: + features.index_copy_(0, index_all.cpu(), torch.cat(output_l).cpu()) + return features + + +@torch.no_grad() +def knn_classifier(train_features, train_labels, test_features, test_labels, k, T, num_classes=1000): + top1, top5, total = 0.0, 0.0, 0 + train_features = train_features.t() + num_test_images, num_chunks = test_labels.shape[0], 100 + imgs_per_chunk = num_test_images // num_chunks + retrieval_one_hot = torch.zeros(k, num_classes).to(train_features.device) + for idx in range(0, num_test_images, imgs_per_chunk): + # get the features for test images + features = test_features[ + idx : min((idx + imgs_per_chunk), num_test_images), : + ] + targets = test_labels[idx : min((idx + imgs_per_chunk), num_test_images)] + batch_size = targets.shape[0] + + # calculate the dot product and compute top-k neighbors + similarity = torch.mm(features, train_features) + distances, indices = similarity.topk(k, largest=True, sorted=True) + candidates = train_labels.view(1, -1).expand(batch_size, -1) + retrieved_neighbors = torch.gather(candidates, 1, indices) + + retrieval_one_hot.resize_(batch_size * k, num_classes).zero_() + retrieval_one_hot.scatter_(1, retrieved_neighbors.view(-1, 1), 1) + distances_transform = distances.clone().div_(T).exp_() + probs = torch.sum( + torch.mul( + retrieval_one_hot.view(batch_size, -1, num_classes), + distances_transform.view(batch_size, -1, 1), + ), + 1, + ) + _, predictions = probs.sort(1, True) + + # find the predictions 
that match the target + correct = predictions.eq(targets.data.view(-1, 1)) + top1 = top1 + correct.narrow(1, 0, 1).sum().item() + top5 = top5 + correct.narrow(1, 0, min(5, k)).sum().item() # top5 does not make sense if k < 5 + total += targets.size(0) + top1 = top1 * 100.0 / total + top5 = top5 * 100.0 / total + return top1, top5 + + +class ReturnIndexDataset(datasets.ImageFolder): + def __getitem__(self, idx): + img, lab = super(ReturnIndexDataset, self).__getitem__(idx) + return img, idx + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('Evaluation with weighted k-NN on ImageNet') + parser.add_argument('--batch_size_per_gpu', default=128, type=int, help='Per-GPU batch-size') + parser.add_argument('--nb_knn', default=[10, 20, 100, 200], nargs='+', type=int, + help='Number of NN to use. 20 is usually working the best.') + parser.add_argument('--temperature', default=0.07, type=float, + help='Temperature used in the voting coefficient') + parser.add_argument('--pretrained_weights', default='', type=str, help="Path to pretrained weights to evaluate.") + parser.add_argument('--use_cuda', default=True, type=utils.bool_flag, + help="Should we store the features on GPU? 
We recommend setting this to False if you encounter OOM") + parser.add_argument('--arch', default='vit_small', type=str, help='Architecture') + parser.add_argument('--patch_size', default=16, type=int, help='Patch resolution of the model.') + parser.add_argument("--checkpoint_key", default="teacher", type=str, + help='Key to use in the checkpoint (example: "teacher")') + parser.add_argument('--dump_features', default=None, + help='Path where to save computed features, empty for no saving') + parser.add_argument('--load_features', default=None, help="""If the features have + already been computed, where to find them.""") + parser.add_argument('--num_workers', default=10, type=int, help='Number of data loading workers per GPU.') + parser.add_argument("--dist_url", default="env://", type=str, help="""url used to set up + distributed training; see https://pytorch.org/docs/stable/distributed.html""") + parser.add_argument("--local_rank", default=0, type=int, help="Please ignore and do not set this argument.") + parser.add_argument('--data_path', default='/path/to/imagenet/', type=str) + args = parser.parse_args() + + utils.init_distributed_mode(args) + print("git:\n {}\n".format(utils.get_sha())) + print("\n".join("%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items()))) + cudnn.benchmark = True + + if args.load_features: + train_features = torch.load(os.path.join(args.load_features, "trainfeat.pth")) + test_features = torch.load(os.path.join(args.load_features, "testfeat.pth")) + train_labels = torch.load(os.path.join(args.load_features, "trainlabels.pth")) + test_labels = torch.load(os.path.join(args.load_features, "testlabels.pth")) + else: + # need to extract features ! 
+ train_features, test_features, train_labels, test_labels = extract_feature_pipeline(args) + + if utils.get_rank() == 0: + if args.use_cuda: + train_features = train_features.cuda() + test_features = test_features.cuda() + train_labels = train_labels.cuda() + test_labels = test_labels.cuda() + + print("Features are ready!\nStart the k-NN classification.") + for k in args.nb_knn: + top1, top5 = knn_classifier(train_features, train_labels, + test_features, test_labels, k, args.temperature) + print(f"{k}-NN classifier result: Top1: {top1}, Top5: {top5}") + dist.barrier() diff --git a/PuzzleTuning/Counterpart PreTrain Methods/dino-main/eval_linear.py b/PuzzleTuning/Counterpart PreTrain Methods/dino-main/eval_linear.py new file mode 100644 index 0000000000000000000000000000000000000000..cdef16b473d216889b493aa0c7a63e15f945092c --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/dino-main/eval_linear.py @@ -0,0 +1,281 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +import argparse +import json +from pathlib import Path + +import torch +from torch import nn +import torch.distributed as dist +import torch.backends.cudnn as cudnn +from torchvision import datasets +from torchvision import transforms as pth_transforms +from torchvision import models as torchvision_models + +import utils +import vision_transformer as vits + + +def eval_linear(args): + utils.init_distributed_mode(args) + print("git:\n {}\n".format(utils.get_sha())) + print("\n".join("%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items()))) + cudnn.benchmark = True + + # ============ building network ... ============ + # if the network is a Vision Transformer (i.e. vit_tiny, vit_small, vit_base) + if args.arch in vits.__dict__.keys(): + model = vits.__dict__[args.arch](patch_size=args.patch_size, num_classes=0) + embed_dim = model.embed_dim * (args.n_last_blocks + int(args.avgpool_patchtokens)) + # if the network is a XCiT + elif "xcit" in args.arch: + model = torch.hub.load('facebookresearch/xcit:main', args.arch, num_classes=0) + embed_dim = model.embed_dim + # otherwise, we check if the architecture is in torchvision models + elif args.arch in torchvision_models.__dict__.keys(): + model = torchvision_models.__dict__[args.arch]() + embed_dim = model.fc.weight.shape[1] + model.fc = nn.Identity() + else: + print(f"Unknow architecture: {args.arch}") + sys.exit(1) + model.cuda() + model.eval() + # load weights to evaluate + utils.load_pretrained_weights(model, args.pretrained_weights, args.checkpoint_key, args.arch, args.patch_size) + print(f"Model {args.arch} built.") + + linear_classifier = LinearClassifier(embed_dim, num_labels=args.num_labels) + linear_classifier = linear_classifier.cuda() + linear_classifier = nn.parallel.DistributedDataParallel(linear_classifier, device_ids=[args.gpu]) + + # ============ preparing data ... 
============ + val_transform = pth_transforms.Compose([ + pth_transforms.Resize(256, interpolation=3), + pth_transforms.CenterCrop(224), + pth_transforms.ToTensor(), + pth_transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ]) + dataset_val = datasets.ImageFolder(os.path.join(args.data_path, "val"), transform=val_transform) + val_loader = torch.utils.data.DataLoader( + dataset_val, + batch_size=args.batch_size_per_gpu, + num_workers=args.num_workers, + pin_memory=True, + ) + + if args.evaluate: + utils.load_pretrained_linear_weights(linear_classifier, args.arch, args.patch_size) + test_stats = validate_network(val_loader, model, linear_classifier, args.n_last_blocks, args.avgpool_patchtokens) + print(f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%") + return + + train_transform = pth_transforms.Compose([ + pth_transforms.RandomResizedCrop(224), + pth_transforms.RandomHorizontalFlip(), + pth_transforms.ToTensor(), + pth_transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ]) + dataset_train = datasets.ImageFolder(os.path.join(args.data_path, "train"), transform=train_transform) + sampler = torch.utils.data.distributed.DistributedSampler(dataset_train) + train_loader = torch.utils.data.DataLoader( + dataset_train, + sampler=sampler, + batch_size=args.batch_size_per_gpu, + num_workers=args.num_workers, + pin_memory=True, + ) + print(f"Data loaded with {len(dataset_train)} train and {len(dataset_val)} val imgs.") + + # set optimizer + optimizer = torch.optim.SGD( + linear_classifier.parameters(), + args.lr * (args.batch_size_per_gpu * utils.get_world_size()) / 256., # linear scaling rule + momentum=0.9, + weight_decay=0, # we do not apply weight decay + ) + scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs, eta_min=0) + + # Optionally resume from a checkpoint + to_restore = {"epoch": 0, "best_acc": 0.} + utils.restart_from_checkpoint( + 
os.path.join(args.output_dir, "checkpoint.pth.tar"), + run_variables=to_restore, + state_dict=linear_classifier, + optimizer=optimizer, + scheduler=scheduler, + ) + start_epoch = to_restore["epoch"] + best_acc = to_restore["best_acc"] + + for epoch in range(start_epoch, args.epochs): + train_loader.sampler.set_epoch(epoch) + + train_stats = train(model, linear_classifier, optimizer, train_loader, epoch, args.n_last_blocks, args.avgpool_patchtokens) + scheduler.step() + + log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, + 'epoch': epoch} + if epoch % args.val_freq == 0 or epoch == args.epochs - 1: + test_stats = validate_network(val_loader, model, linear_classifier, args.n_last_blocks, args.avgpool_patchtokens) + print(f"Accuracy at epoch {epoch} of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%") + best_acc = max(best_acc, test_stats["acc1"]) + print(f'Max accuracy so far: {best_acc:.2f}%') + log_stats = {**{k: v for k, v in log_stats.items()}, + **{f'test_{k}': v for k, v in test_stats.items()}} + if utils.is_main_process(): + with (Path(args.output_dir) / "log.txt").open("a") as f: + f.write(json.dumps(log_stats) + "\n") + save_dict = { + "epoch": epoch + 1, + "state_dict": linear_classifier.state_dict(), + "optimizer": optimizer.state_dict(), + "scheduler": scheduler.state_dict(), + "best_acc": best_acc, + } + torch.save(save_dict, os.path.join(args.output_dir, "checkpoint.pth.tar")) + print("Training of the supervised linear classifier on frozen features completed.\n" + "Top-1 test accuracy: {acc:.1f}".format(acc=best_acc)) + + +def train(model, linear_classifier, optimizer, loader, epoch, n, avgpool): + linear_classifier.train() + metric_logger = utils.MetricLogger(delimiter=" ") + metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}')) + header = 'Epoch: [{}]'.format(epoch) + for (inp, target) in metric_logger.log_every(loader, 20, header): + # move to gpu + inp = 
inp.cuda(non_blocking=True) + target = target.cuda(non_blocking=True) + + # forward + with torch.no_grad(): + if "vit" in args.arch: + intermediate_output = model.get_intermediate_layers(inp, n) + output = torch.cat([x[:, 0] for x in intermediate_output], dim=-1) + if avgpool: + output = torch.cat((output.unsqueeze(-1), torch.mean(intermediate_output[-1][:, 1:], dim=1).unsqueeze(-1)), dim=-1) + output = output.reshape(output.shape[0], -1) + else: + output = model(inp) + output = linear_classifier(output) + + # compute cross entropy loss + loss = nn.CrossEntropyLoss()(output, target) + + # compute the gradients + optimizer.zero_grad() + loss.backward() + + # step + optimizer.step() + + # log + torch.cuda.synchronize() + metric_logger.update(loss=loss.item()) + metric_logger.update(lr=optimizer.param_groups[0]["lr"]) + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + return {k: meter.global_avg for k, meter in metric_logger.meters.items()} + + +@torch.no_grad() +def validate_network(val_loader, model, linear_classifier, n, avgpool): + linear_classifier.eval() + metric_logger = utils.MetricLogger(delimiter=" ") + header = 'Test:' + for inp, target in metric_logger.log_every(val_loader, 20, header): + # move to gpu + inp = inp.cuda(non_blocking=True) + target = target.cuda(non_blocking=True) + + # forward + with torch.no_grad(): + if "vit" in args.arch: + intermediate_output = model.get_intermediate_layers(inp, n) + output = torch.cat([x[:, 0] for x in intermediate_output], dim=-1) + if avgpool: + output = torch.cat((output.unsqueeze(-1), torch.mean(intermediate_output[-1][:, 1:], dim=1).unsqueeze(-1)), dim=-1) + output = output.reshape(output.shape[0], -1) + else: + output = model(inp) + output = linear_classifier(output) + loss = nn.CrossEntropyLoss()(output, target) + + if linear_classifier.module.num_labels >= 5: + acc1, acc5 = utils.accuracy(output, target, topk=(1, 5)) + else: + 
acc1, = utils.accuracy(output, target, topk=(1,)) + + batch_size = inp.shape[0] + metric_logger.update(loss=loss.item()) + metric_logger.meters['acc1'].update(acc1.item(), n=batch_size) + if linear_classifier.module.num_labels >= 5: + metric_logger.meters['acc5'].update(acc5.item(), n=batch_size) + if linear_classifier.module.num_labels >= 5: + print('* Acc@1 {top1.global_avg:.3f} Acc@5 {top5.global_avg:.3f} loss {losses.global_avg:.3f}' + .format(top1=metric_logger.acc1, top5=metric_logger.acc5, losses=metric_logger.loss)) + else: + print('* Acc@1 {top1.global_avg:.3f} loss {losses.global_avg:.3f}' + .format(top1=metric_logger.acc1, losses=metric_logger.loss)) + return {k: meter.global_avg for k, meter in metric_logger.meters.items()} + + +class LinearClassifier(nn.Module): + """Linear layer to train on top of frozen features""" + def __init__(self, dim, num_labels=1000): + super(LinearClassifier, self).__init__() + self.num_labels = num_labels + self.linear = nn.Linear(dim, num_labels) + self.linear.weight.data.normal_(mean=0.0, std=0.01) + self.linear.bias.data.zero_() + + def forward(self, x): + # flatten + x = x.view(x.size(0), -1) + + # linear layer + return self.linear(x) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('Evaluation with linear classification on ImageNet') + parser.add_argument('--n_last_blocks', default=4, type=int, help="""Concatenate [CLS] tokens + for the `n` last blocks. We use `n=4` when evaluating ViT-Small and `n=1` with ViT-Base.""") + parser.add_argument('--avgpool_patchtokens', default=False, type=utils.bool_flag, + help="""Whether ot not to concatenate the global average pooled features to the [CLS] token. 
+ We typically set this to False for ViT-Small and to True with ViT-Base.""") + parser.add_argument('--arch', default='vit_small', type=str, help='Architecture') + parser.add_argument('--patch_size', default=16, type=int, help='Patch resolution of the model.') + parser.add_argument('--pretrained_weights', default='', type=str, help="Path to pretrained weights to evaluate.") + parser.add_argument("--checkpoint_key", default="teacher", type=str, help='Key to use in the checkpoint (example: "teacher")') + parser.add_argument('--epochs', default=100, type=int, help='Number of epochs of training.') + parser.add_argument("--lr", default=0.001, type=float, help="""Learning rate at the beginning of + training (highest LR used during training). The learning rate is linearly scaled + with the batch size, and specified here for a reference batch size of 256. + We recommend tweaking the LR depending on the checkpoint evaluated.""") + parser.add_argument('--batch_size_per_gpu', default=128, type=int, help='Per-GPU batch-size') + parser.add_argument("--dist_url", default="env://", type=str, help="""url used to set up + distributed training; see https://pytorch.org/docs/stable/distributed.html""") + parser.add_argument("--local_rank", default=0, type=int, help="Please ignore and do not set this argument.") + parser.add_argument('--data_path', default='/path/to/imagenet/', type=str) + parser.add_argument('--num_workers', default=10, type=int, help='Number of data loading workers per GPU.') + parser.add_argument('--val_freq', default=1, type=int, help="Epoch frequency for validation.") + parser.add_argument('--output_dir', default=".", help='Path to save logs and checkpoints') + parser.add_argument('--num_labels', default=1000, type=int, help='Number of labels for linear classifier') + parser.add_argument('--evaluate', dest='evaluate', action='store_true', help='evaluate model on validation set') + args = parser.parse_args() + eval_linear(args) diff --git a/PuzzleTuning/Counterpart 
PreTrain Methods/dino-main/eval_video_segmentation.py b/PuzzleTuning/Counterpart PreTrain Methods/dino-main/eval_video_segmentation.py new file mode 100644 index 0000000000000000000000000000000000000000..08a18c475db9cbadb29d2e0f22113c0cc9efed49 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/dino-main/eval_video_segmentation.py @@ -0,0 +1,292 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Some parts are taken from https://github.com/Liusifei/UVC +""" +import os +import copy +import glob +import queue +from urllib.request import urlopen +import argparse +import numpy as np +from tqdm import tqdm + +import cv2 +import torch +import torch.nn as nn +from torch.nn import functional as F +from PIL import Image +from torchvision import transforms + +import utils +import vision_transformer as vits + + +@torch.no_grad() +def eval_video_tracking_davis(args, model, frame_list, video_dir, first_seg, seg_ori, color_palette): + """ + Evaluate tracking on a video given first frame & segmentation + """ + video_folder = os.path.join(args.output_dir, video_dir.split('/')[-1]) + os.makedirs(video_folder, exist_ok=True) + + # The queue stores the n preceeding frames + que = queue.Queue(args.n_last_frames) + + # first frame + frame1, ori_h, ori_w = read_frame(frame_list[0]) + # extract first frame feature + frame1_feat = extract_feature(model, frame1).T # dim x h*w + + # saving first segmentation + out_path = 
os.path.join(video_folder, "00000.png") + imwrite_indexed(out_path, seg_ori, color_palette) + mask_neighborhood = None + for cnt in tqdm(range(1, len(frame_list))): + frame_tar = read_frame(frame_list[cnt])[0] + + # we use the first segmentation and the n previous ones + used_frame_feats = [frame1_feat] + [pair[0] for pair in list(que.queue)] + used_segs = [first_seg] + [pair[1] for pair in list(que.queue)] + + frame_tar_avg, feat_tar, mask_neighborhood = label_propagation(args, model, frame_tar, used_frame_feats, used_segs, mask_neighborhood) + + # pop out oldest frame if neccessary + if que.qsize() == args.n_last_frames: + que.get() + # push current results into queue + seg = copy.deepcopy(frame_tar_avg) + que.put([feat_tar, seg]) + + # upsampling & argmax + frame_tar_avg = F.interpolate(frame_tar_avg, scale_factor=args.patch_size, mode='bilinear', align_corners=False, recompute_scale_factor=False)[0] + frame_tar_avg = norm_mask(frame_tar_avg) + _, frame_tar_seg = torch.max(frame_tar_avg, dim=0) + + # saving to disk + frame_tar_seg = np.array(frame_tar_seg.squeeze().cpu(), dtype=np.uint8) + frame_tar_seg = np.array(Image.fromarray(frame_tar_seg).resize((ori_w, ori_h), 0)) + frame_nm = frame_list[cnt].split('/')[-1].replace(".jpg", ".png") + imwrite_indexed(os.path.join(video_folder, frame_nm), frame_tar_seg, color_palette) + + +def restrict_neighborhood(h, w): + # We restrict the set of source nodes considered to a spatial neighborhood of the query node (i.e. 
``local attention'') + mask = torch.zeros(h, w, h, w) + for i in range(h): + for j in range(w): + for p in range(2 * args.size_mask_neighborhood + 1): + for q in range(2 * args.size_mask_neighborhood + 1): + if i - args.size_mask_neighborhood + p < 0 or i - args.size_mask_neighborhood + p >= h: + continue + if j - args.size_mask_neighborhood + q < 0 or j - args.size_mask_neighborhood + q >= w: + continue + mask[i, j, i - args.size_mask_neighborhood + p, j - args.size_mask_neighborhood + q] = 1 + + mask = mask.reshape(h * w, h * w) + return mask.cuda(non_blocking=True) + + +def norm_mask(mask): + c, h, w = mask.size() + for cnt in range(c): + mask_cnt = mask[cnt,:,:] + if(mask_cnt.max() > 0): + mask_cnt = (mask_cnt - mask_cnt.min()) + mask_cnt = mask_cnt/mask_cnt.max() + mask[cnt,:,:] = mask_cnt + return mask + + +def label_propagation(args, model, frame_tar, list_frame_feats, list_segs, mask_neighborhood=None): + """ + propagate segs of frames in list_frames to frame_tar + """ + ## we only need to extract feature of the target frame + feat_tar, h, w = extract_feature(model, frame_tar, return_h_w=True) + + return_feat_tar = feat_tar.T # dim x h*w + + ncontext = len(list_frame_feats) + feat_sources = torch.stack(list_frame_feats) # nmb_context x dim x h*w + + feat_tar = F.normalize(feat_tar, dim=1, p=2) + feat_sources = F.normalize(feat_sources, dim=1, p=2) + + feat_tar = feat_tar.unsqueeze(0).repeat(ncontext, 1, 1) + aff = torch.exp(torch.bmm(feat_tar, feat_sources) / 0.1) # nmb_context x h*w (tar: query) x h*w (source: keys) + + if args.size_mask_neighborhood > 0: + if mask_neighborhood is None: + mask_neighborhood = restrict_neighborhood(h, w) + mask_neighborhood = mask_neighborhood.unsqueeze(0).repeat(ncontext, 1, 1) + aff *= mask_neighborhood + + aff = aff.transpose(2, 1).reshape(-1, h * w) # nmb_context*h*w (source: keys) x h*w (tar: queries) + tk_val, _ = torch.topk(aff, dim=0, k=args.topk) + tk_val_min, _ = torch.min(tk_val, dim=0) + aff[aff < tk_val_min] = 0 
+ + aff = aff / torch.sum(aff, keepdim=True, axis=0) + + list_segs = [s.cuda() for s in list_segs] + segs = torch.cat(list_segs) + nmb_context, C, h, w = segs.shape + segs = segs.reshape(nmb_context, C, -1).transpose(2, 1).reshape(-1, C).T # C x nmb_context*h*w + seg_tar = torch.mm(segs, aff) + seg_tar = seg_tar.reshape(1, C, h, w) + return seg_tar, return_feat_tar, mask_neighborhood + + +def extract_feature(model, frame, return_h_w=False): + """Extract one frame feature everytime.""" + out = model.get_intermediate_layers(frame.unsqueeze(0).cuda(), n=1)[0] + out = out[:, 1:, :] # we discard the [CLS] token + h, w = int(frame.shape[1] / model.patch_embed.patch_size), int(frame.shape[2] / model.patch_embed.patch_size) + dim = out.shape[-1] + out = out[0].reshape(h, w, dim) + out = out.reshape(-1, dim) + if return_h_w: + return out, h, w + return out + + +def imwrite_indexed(filename, array, color_palette): + """ Save indexed png for DAVIS.""" + if np.atleast_3d(array).shape[2] != 1: + raise Exception("Saving indexed PNGs requires 2D array.") + + im = Image.fromarray(array) + im.putpalette(color_palette.ravel()) + im.save(filename, format='PNG') + + +def to_one_hot(y_tensor, n_dims=None): + """ + Take integer y (tensor or variable) with n dims & + convert it to 1-hot representation with n+1 dims. 
+ """ + if(n_dims is None): + n_dims = int(y_tensor.max()+ 1) + _,h,w = y_tensor.size() + y_tensor = y_tensor.type(torch.LongTensor).view(-1, 1) + n_dims = n_dims if n_dims is not None else int(torch.max(y_tensor)) + 1 + y_one_hot = torch.zeros(y_tensor.size()[0], n_dims).scatter_(1, y_tensor, 1) + y_one_hot = y_one_hot.view(h,w,n_dims) + return y_one_hot.permute(2, 0, 1).unsqueeze(0) + + +def read_frame_list(video_dir): + frame_list = [img for img in glob.glob(os.path.join(video_dir,"*.jpg"))] + frame_list = sorted(frame_list) + return frame_list + + +def read_frame(frame_dir, scale_size=[480]): + """ + read a single frame & preprocess + """ + img = cv2.imread(frame_dir) + ori_h, ori_w, _ = img.shape + if len(scale_size) == 1: + if(ori_h > ori_w): + tw = scale_size[0] + th = (tw * ori_h) / ori_w + th = int((th // 64) * 64) + else: + th = scale_size[0] + tw = (th * ori_w) / ori_h + tw = int((tw // 64) * 64) + else: + th, tw = scale_size + img = cv2.resize(img, (tw, th)) + img = img.astype(np.float32) + img = img / 255.0 + img = img[:, :, ::-1] + img = np.transpose(img.copy(), (2, 0, 1)) + img = torch.from_numpy(img).float() + img = color_normalize(img) + return img, ori_h, ori_w + + +def read_seg(seg_dir, factor, scale_size=[480]): + seg = Image.open(seg_dir) + _w, _h = seg.size # note PIL.Image.Image's size is (w, h) + if len(scale_size) == 1: + if(_w > _h): + _th = scale_size[0] + _tw = (_th * _w) / _h + _tw = int((_tw // 64) * 64) + else: + _tw = scale_size[0] + _th = (_tw * _h) / _w + _th = int((_th // 64) * 64) + else: + _th = scale_size[1] + _tw = scale_size[0] + small_seg = np.array(seg.resize((_tw // factor, _th // factor), 0)) + small_seg = torch.from_numpy(small_seg.copy()).contiguous().float().unsqueeze(0) + return to_one_hot(small_seg), np.asarray(seg) + + +def color_normalize(x, mean=[0.485, 0.456, 0.406], std=[0.228, 0.224, 0.225]): + for t, m, s in zip(x, mean, std): + t.sub_(m) + t.div_(s) + return x + + +if __name__ == '__main__': + parser = 
argparse.ArgumentParser('Evaluation with video object segmentation on DAVIS 2017') + parser.add_argument('--pretrained_weights', default='', type=str, help="Path to pretrained weights to evaluate.") + parser.add_argument('--arch', default='vit_small', type=str, + choices=['vit_tiny', 'vit_small', 'vit_base'], help='Architecture (support only ViT atm).') + parser.add_argument('--patch_size', default=16, type=int, help='Patch resolution of the model.') + parser.add_argument("--checkpoint_key", default="teacher", type=str, help='Key to use in the checkpoint (example: "teacher")') + parser.add_argument('--output_dir', default=".", help='Path where to save segmentations') + parser.add_argument('--data_path', default='/path/to/davis/', type=str) + parser.add_argument("--n_last_frames", type=int, default=7, help="number of preceeding frames") + parser.add_argument("--size_mask_neighborhood", default=12, type=int, + help="We restrict the set of source nodes considered to a spatial neighborhood of the query node") + parser.add_argument("--topk", type=int, default=5, help="accumulate label from top k neighbors") + parser.add_argument("--bs", type=int, default=6, help="Batch size, try to reduce if OOM") + args = parser.parse_args() + + print("git:\n {}\n".format(utils.get_sha())) + print("\n".join("%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items()))) + + # building network + model = vits.__dict__[args.arch](patch_size=args.patch_size, num_classes=0) + print(f"Model {args.arch} {args.patch_size}x{args.patch_size} built.") + model.cuda() + utils.load_pretrained_weights(model, args.pretrained_weights, args.checkpoint_key, args.arch, args.patch_size) + for param in model.parameters(): + param.requires_grad = False + model.eval() + + color_palette = [] + for line in urlopen("https://raw.githubusercontent.com/Liusifei/UVC/master/libs/data/palette.txt"): + color_palette.append([int(i) for i in line.decode("utf-8").split('\n')[0].split(" ")]) + color_palette = 
# Root URL under which every released DINO checkpoint is published.
_DINO_CHECKPOINT_ROOT = "https://dl.fbaipublicfiles.com/dino"


def _load_dino_checkpoint(model, checkpoint_name, strict=True):
    """Download the named DINO checkpoint and load it into ``model``.

    The leading underscore keeps this helper out of the public entry points
    of this hubconf module.

    Args:
        model: module receiving the weights.
        checkpoint_name: release name; the file is fetched from
            ``<root>/<checkpoint_name>/<checkpoint_name>.pth``.
        strict: forwarded to ``model.load_state_dict``.

    Returns:
        ``model``, for convenient chaining.
    """
    url = f"{_DINO_CHECKPOINT_ROOT}/{checkpoint_name}/{checkpoint_name}.pth"
    state_dict = torch.hub.load_state_dict_from_url(url=url, map_location="cpu")
    model.load_state_dict(state_dict, strict=strict)
    return model


def dino_vits16(pretrained=True, **kwargs):
    """
    ViT-Small/16x16 pre-trained with DINO.
    Achieves 74.5% top-1 accuracy on ImageNet with k-NN classification.
    """
    model = vits.__dict__["vit_small"](patch_size=16, num_classes=0, **kwargs)
    if pretrained:
        _load_dino_checkpoint(model, "dino_deitsmall16_pretrain")
    return model


def dino_vits8(pretrained=True, **kwargs):
    """
    ViT-Small/8x8 pre-trained with DINO.
    Achieves 78.3% top-1 accuracy on ImageNet with k-NN classification.
    """
    model = vits.__dict__["vit_small"](patch_size=8, num_classes=0, **kwargs)
    if pretrained:
        _load_dino_checkpoint(model, "dino_deitsmall8_pretrain")
    return model


def dino_vitb16(pretrained=True, **kwargs):
    """
    ViT-Base/16x16 pre-trained with DINO.
    Achieves 76.1% top-1 accuracy on ImageNet with k-NN classification.
    """
    model = vits.__dict__["vit_base"](patch_size=16, num_classes=0, **kwargs)
    if pretrained:
        _load_dino_checkpoint(model, "dino_vitbase16_pretrain")
    return model


def dino_vitb8(pretrained=True, **kwargs):
    """
    ViT-Base/8x8 pre-trained with DINO.
    Achieves 77.4% top-1 accuracy on ImageNet with k-NN classification.
    """
    model = vits.__dict__["vit_base"](patch_size=8, num_classes=0, **kwargs)
    if pretrained:
        _load_dino_checkpoint(model, "dino_vitbase8_pretrain")
    return model


def dino_resnet50(pretrained=True, **kwargs):
    """
    ResNet-50 pre-trained with DINO.
    Achieves 75.3% top-1 accuracy on ImageNet linear evaluation benchmark (requires to train `fc`).
    """
    model = resnet50(pretrained=False, **kwargs)
    model.fc = torch.nn.Identity()
    if pretrained:
        # Non-strict: the classifier was replaced by Identity just above.
        _load_dino_checkpoint(model, "dino_resnet50_pretrain", strict=False)
    return model


def dino_xcit_small_12_p16(pretrained=True, **kwargs):
    """
    XCiT-Small-12/16 pre-trained with DINO.
    """
    model = torch.hub.load('facebookresearch/xcit:main', "xcit_small_12_p16", num_classes=0, **kwargs)
    if pretrained:
        _load_dino_checkpoint(model, "dino_xcit_small_12_p16_pretrain")
    return model


def dino_xcit_small_12_p8(pretrained=True, **kwargs):
    """
    XCiT-Small-12/8 pre-trained with DINO.
    """
    model = torch.hub.load('facebookresearch/xcit:main', "xcit_small_12_p8", num_classes=0, **kwargs)
    if pretrained:
        _load_dino_checkpoint(model, "dino_xcit_small_12_p8_pretrain")
    return model


def dino_xcit_medium_24_p16(pretrained=True, **kwargs):
    """
    XCiT-Medium-24/16 pre-trained with DINO.
    """
    model = torch.hub.load('facebookresearch/xcit:main', "xcit_medium_24_p16", num_classes=0, **kwargs)
    if pretrained:
        _load_dino_checkpoint(model, "dino_xcit_medium_24_p16_pretrain")
    return model


def dino_xcit_medium_24_p8(pretrained=True, **kwargs):
    """
    XCiT-Medium-24/8 pre-trained with DINO.
    """
    model = torch.hub.load('facebookresearch/xcit:main', "xcit_medium_24_p8", num_classes=0, **kwargs)
    if pretrained:
        _load_dino_checkpoint(model, "dino_xcit_medium_24_p8_pretrain")
    return model
+import argparse +import os +import sys +import datetime +import time +import math +import json +from pathlib import Path + +import numpy as np +from PIL import Image +import torch +import torch.nn as nn +import torch.distributed as dist +import torch.backends.cudnn as cudnn +import torch.nn.functional as F +from torchvision import datasets, transforms +from torchvision import models as torchvision_models + +import utils +import vision_transformer as vits +from vision_transformer import DINOHead + +torchvision_archs = sorted(name for name in torchvision_models.__dict__ + if name.islower() and not name.startswith("__") + and callable(torchvision_models.__dict__[name])) + +def get_args_parser(): + parser = argparse.ArgumentParser('DINO', add_help=False) + + # Model parameters + parser.add_argument('--arch', default='vit_base', type=str, + help="""Name of architecture to train. For quick experiments with ViTs, + we recommend using vit_tiny or vit_small.""") + parser.add_argument('--patch_size', default=16, type=int, help="""Size in pixels + of input square patches - default 16 (for 16x16 patches). Using smaller + values leads to better performance but requires more memory. Applies only + for ViTs (vit_tiny, vit_small and vit_base). If <16, we recommend disabling + mixed precision training (--use_fp16 false) to avoid unstabilities.""") + parser.add_argument('--input_size', default=224, type=int) + parser.add_argument('--out_dim', default=65536, type=int, help="""Dimensionality of + the DINO head output. For complex and large datasets large values (like 65k) work well.""") + parser.add_argument('--norm_last_layer', default=True, type=utils.bool_flag, + help="""Whether or not to weight normalize the last layer of the DINO head. + Not normalizing leads to better performance but can make the training unstable. 
+ In our experiments, we typically set this paramater to False with vit_small and True with vit_base.""") + parser.add_argument('--momentum_teacher', default=0.996, type=float, help="""Base EMA + parameter for teacher update. The value is increased to 1 during training with cosine schedule. + We recommend setting a higher value with small batches: for example use 0.9995 with batch size of 256.""") + parser.add_argument('--use_bn_in_head', default=False, type=utils.bool_flag, + help="Whether to use batch normalizations in projection head (Default: False)") + + # Temperature teacher parameters + parser.add_argument('--warmup_teacher_temp', default=0.04, type=float, + help="""Initial value for the teacher temperature: 0.04 works well in most cases. + Try decreasing it if the training loss does not decrease.""") + parser.add_argument('--teacher_temp', default=0.04, type=float, help="""Final value (after linear warmup) + of the teacher temperature. For most experiments, anything above 0.07 is unstable. We recommend + starting with the default value of 0.04 and increase this slightly if needed.""") + parser.add_argument('--warmup_teacher_temp_epochs', default=0, type=int, + help='Number of warmup epochs for the teacher temperature (Default: 30).') + + # Training/Optimization parameters + parser.add_argument('--use_fp16', type=utils.bool_flag, default=True, help="""Whether or not + to use half precision for training. Improves training time and memory requirements, + but can provoke instability and slight decay of performance. We recommend disabling + mixed precision if the loss is unstable, if reducing the patch size or if training with bigger ViTs.""") + parser.add_argument('--weight_decay', type=float, default=0.04, help="""Initial value of the + weight decay. With ViT, a smaller value at the beginning of training works well.""") + parser.add_argument('--weight_decay_end', type=float, default=0.4, help="""Final value of the + weight decay. 
We use a cosine schedule for WD and using a larger decay by + the end of training improves performance for ViTs.""") + parser.add_argument('--clip_grad', type=float, default=3.0, help="""Maximal parameter + gradient norm if using gradient clipping. Clipping with norm .3 ~ 1.0 can + help optimization for larger ViT architectures. 0 for disabling.""") + parser.add_argument('--batch_size_per_gpu', default=512, type=int, + help='Per-GPU batch-size : number of distinct images loaded on one GPU.') + parser.add_argument('--epochs', default=150, type=int, help='Number of epochs of training.') + parser.add_argument('--freeze_last_layer', default=1, type=int, help="""Number of epochs + during which we keep the output layer fixed. Typically doing so during + the first epoch helps training. Try increasing this value if the loss does not decrease.""") + parser.add_argument("--lr", default=1.5e-4, type=float, help="""Learning rate at the end of + linear warmup (highest LR used during training). The learning rate is linearly scaled + with the batch size, and specified here for a reference batch size of 512.""") + parser.add_argument("--warmup_epochs", default=20, type=int, + help="Number of epochs for the linear learning-rate warm up.") + parser.add_argument('--min_lr', type=float, default=1e-6, help="""Target LR at the + end of optimization. We use a cosine LR schedule with linear warmup.""") + parser.add_argument('--optimizer', default='adamw', type=str, + choices=['adamw', 'sgd', 'lars'], help="""Type of optimizer. We recommend using adamw with ViTs.""") + parser.add_argument('--drop_path_rate', type=float, default=0.1, help="stochastic depth rate") + + # Multi-crop parameters + parser.add_argument('--global_crops_scale', type=float, nargs='+', default=(0.4, 1.), + help="""Scale range of the cropped image before resizing, relatively to the origin image. + Used for large global view cropping. 
def _init_from_basic_state_dict(model, checkpoint_path, role):
    """Best-effort initialisation of ``model`` from a checkpoint file.

    Args:
        model: freshly built backbone to initialise.
        checkpoint_path: path of the checkpoint, or ``None`` to keep the
            random initialisation.
        role: ``'Student'`` or ``'Teacher'``; only used in log messages.

    Returns:
        ``True`` when the weights were loaded, ``False`` otherwise. A failed
        load is reported and training continues from random weights.
    """
    if checkpoint_path is None:
        print(f'{role} PreTuning Restart')
        return False
    try:
        # Load on CPU so every rank does not deserialise onto the saving GPU.
        basic_state_dict = torch.load(checkpoint_path, map_location='cpu')
        if 'model' in basic_state_dict:
            basic_state_dict = basic_state_dict['model']
        model.load_state_dict(basic_state_dict, False)
    except Exception as err:  # deliberately broad: loading is best effort
        print('erro in args.basic_state_dict:', checkpoint_path)
        print('reason:', repr(err))
        print(f'{role} PreTuning Restart')  # the weights were not loaded
        return False
    print(f'{role} PreTuning with Transfer-learning with:', checkpoint_path)
    return True


def train_dino(args):
    """Run DINO pre-training end to end.

    Sets up distributed mode, the multi-crop data pipeline, the student and
    teacher networks (optionally initialised from ``args.basic_state_dict``),
    the loss, optimizer and schedules, then trains for ``args.epochs`` epochs
    while writing checkpoints and a JSON-lines log to ``args.output_dir``.

    Args:
        args: parsed command-line namespace of this script.

    Raises:
        ValueError: if ``args.arch`` is not a model of ``vision_transformer``.
    """
    utils.init_distributed_mode(args)
    utils.fix_random_seeds(args.seed)
    print('ok')
    print("git:\n {}\n".format(utils.get_sha()))
    print("\n".join("%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items())))
    cudnn.benchmark = True

    # ============ preparing data ... ============
    transform = DataAugmentationDINO(
        args.global_crops_scale,
        args.local_crops_scale,
        args.local_crops_number,
    )
    dataset = datasets.ImageFolder(args.data_path, transform=transform)
    sampler = torch.utils.data.DistributedSampler(dataset, shuffle=True)
    data_loader = torch.utils.data.DataLoader(
        dataset,
        sampler=sampler,
        batch_size=args.batch_size_per_gpu,
        num_workers=args.num_workers,
        pin_memory=True,
        drop_last=True,
    )
    print(f"Data loaded: there are {len(dataset)} images.")

    # ============ building student and teacher networks ... ============
    # we changed the name DeiT-S for ViT-S to avoid confusions
    args.arch = args.arch.replace("deit", "vit")
    # Only Vision Transformers (i.e. vit_tiny, vit_small, vit_base) are
    # supported here; fail early instead of hitting a NameError further down.
    if args.arch not in vits.__dict__.keys():
        raise ValueError(f"Unknown architecture: {args.arch}")
    student = vits.__dict__[args.arch](
        patch_size=args.patch_size,
        drop_path_rate=args.drop_path_rate,  # stochastic depth
    )
    _init_from_basic_state_dict(student, args.basic_state_dict, 'Student')
    teacher = vits.__dict__[args.arch](patch_size=args.patch_size)
    _init_from_basic_state_dict(teacher, args.basic_state_dict, 'Teacher')
    embed_dim = student.embed_dim

    # multi-crop wrapper handles forward with inputs of different resolutions
    student = utils.MultiCropWrapper(student, DINOHead(
        embed_dim,
        args.out_dim,
        use_bn=args.use_bn_in_head,
        norm_last_layer=args.norm_last_layer,
    ))
    teacher = utils.MultiCropWrapper(
        teacher,
        DINOHead(embed_dim, args.out_dim, args.use_bn_in_head),
    )
    # move networks to gpu
    student, teacher = student.cuda(), teacher.cuda()
    # synchronize batch norms (if any)
    if utils.has_batchnorms(student):
        student = nn.SyncBatchNorm.convert_sync_batchnorm(student)
        teacher = nn.SyncBatchNorm.convert_sync_batchnorm(teacher)

        # we need DDP wrapper to have synchro batch norms working...
        teacher = nn.parallel.DistributedDataParallel(teacher, device_ids=[args.gpu])
        teacher_without_ddp = teacher.module
    else:
        # teacher_without_ddp and teacher are the same thing
        teacher_without_ddp = teacher
    student = nn.parallel.DistributedDataParallel(student, device_ids=[args.gpu])
    # teacher and student start with the same weights
    teacher_without_ddp.load_state_dict(student.module.state_dict())
    # there is no backpropagation through the teacher, so no need for gradients
    for p in teacher.parameters():
        p.requires_grad = False
    print(f"Student and Teacher are built: they are both {args.arch} network.")

    # ============ preparing loss ... ============
    dino_loss = DINOLoss(
        args.out_dim,
        args.local_crops_number + 2,  # total number of crops = 2 global crops + local_crops_number
        args.warmup_teacher_temp,
        args.teacher_temp,
        args.warmup_teacher_temp_epochs,
        args.epochs,
    ).cuda()

    # ============ preparing optimizer ... ============
    params_groups = utils.get_params_groups(student)
    if args.optimizer == "adamw":
        optimizer = torch.optim.AdamW(params_groups)  # to use with ViTs
    elif args.optimizer == "sgd":
        optimizer = torch.optim.SGD(params_groups, lr=0, momentum=0.9)  # lr is set by scheduler
    elif args.optimizer == "lars":
        optimizer = utils.LARS(params_groups)  # to use with convnet and large batches
    # for mixed precision training
    fp16_scaler = None
    if args.use_fp16:
        fp16_scaler = torch.cuda.amp.GradScaler()

    # ============ init schedulers ... ============
    lr_schedule = utils.cosine_scheduler(
        args.lr * (args.batch_size_per_gpu * utils.get_world_size()) / 256.,  # linear scaling rule
        args.min_lr,
        args.epochs, len(data_loader),
        warmup_epochs=args.warmup_epochs,
    )
    wd_schedule = utils.cosine_scheduler(
        args.weight_decay,
        args.weight_decay_end,
        args.epochs, len(data_loader),
    )
    # momentum parameter is increased to 1. during training with a cosine schedule
    momentum_schedule = utils.cosine_scheduler(args.momentum_teacher, 1,
                                               args.epochs, len(data_loader))
    print(f"Loss, optimizer and schedulers ready.")

    # ============ optionally resume training ... ============
    to_restore = {"epoch": 0}
    utils.restart_from_checkpoint(
        os.path.join(args.output_dir, "checkpoint.pth"),
        run_variables=to_restore,
        student=student,
        teacher=teacher,
        optimizer=optimizer,
        fp16_scaler=fp16_scaler,
        dino_loss=dino_loss,
    )
    start_epoch = to_restore["epoch"]

    # Name parts of the periodic checkpoints, e.g. 'ViT_b16_224_Imagenet' and
    # 'All'. Without an initial checkpoint the run is tagged 'scratch' rather
    # than crashing on os.path.split(None).
    if args.basic_state_dict is not None:
        initial_setting = os.path.split(args.basic_state_dict)[1].split('.')[0]
    else:
        initial_setting = 'scratch'
    dataset_using = os.path.split(args.data_path)[1]

    start_time = time.time()
    print("Starting DINO training !")
    for epoch in range(start_epoch, args.epochs):
        data_loader.sampler.set_epoch(epoch)

        # ============ training one epoch of DINO ... ============
        train_stats = train_one_epoch(student, teacher, teacher_without_ddp, dino_loss,
                                      data_loader, optimizer, lr_schedule, wd_schedule, momentum_schedule,
                                      epoch, fp16_scaler, args)

        # ============ writing logs ... ============
        save_dict = {
            'student': student.state_dict(),
            'teacher': teacher.state_dict(),
            'optimizer': optimizer.state_dict(),
            'epoch': epoch + 1,
            'args': args,
            'dino_loss': dino_loss.state_dict(),
        }
        if fp16_scaler is not None:
            save_dict['fp16_scaler'] = fp16_scaler.state_dict()
        # rolling checkpoint used for resuming
        utils.save_on_master(save_dict, os.path.join(args.output_dir, 'checkpoint.pth'))
        # periodic snapshot, plus one after the final epoch
        if args.saveckp_freq and (epoch % args.saveckp_freq == 0 or epoch + 1 == args.epochs):
            snapshot_name = f'dino_{initial_setting}_{dataset_using}_checkpoint{epoch:04}.pth'
            utils.save_on_master(save_dict, os.path.join(args.output_dir, snapshot_name))

        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     'epoch': epoch}
        if utils.is_main_process():
            with (Path(args.output_dir) / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
class DINOLoss(nn.Module):
    """DINO objective: cross-entropy between teacher and student softmaxes.

    The teacher output is centered with a running mean (``center`` buffer)
    and sharpened with a per-epoch temperature before being compared with
    the student output of every *other* view.
    """

    def __init__(self, out_dim, ncrops, warmup_teacher_temp, teacher_temp,
                 warmup_teacher_temp_epochs, nepochs, student_temp=0.1,
                 center_momentum=0.9):
        """
        Args:
            out_dim: dimensionality of the head output.
            ncrops: number of views per image seen by the student.
            warmup_teacher_temp: teacher temperature at epoch 0.
            teacher_temp: teacher temperature after the warm-up.
            warmup_teacher_temp_epochs: length of the linear warm-up.
            nepochs: total number of training epochs.
            student_temp: temperature dividing the student output.
            center_momentum: EMA factor of the teacher center.
        """
        super().__init__()
        self.student_temp = student_temp
        self.center_momentum = center_momentum
        self.ncrops = ncrops
        self.register_buffer("center", torch.zeros(1, out_dim))
        # we apply a warm up for the teacher temperature because
        # a too high temperature makes the training instable at the beginning
        self.teacher_temp_schedule = np.concatenate((
            np.linspace(warmup_teacher_temp,
                        teacher_temp, warmup_teacher_temp_epochs),
            np.ones(nepochs - warmup_teacher_temp_epochs) * teacher_temp
        ))

    def forward(self, student_output, teacher_output, epoch):
        """
        Cross-entropy between softmax outputs of the teacher and student networks.

        Args:
            student_output: student logits of all ``ncrops`` views stacked
                along dim 0.
            teacher_output: teacher logits of the 2 global views stacked
                along dim 0.
            epoch: current epoch, used to look up the teacher temperature.

        Returns:
            Scalar loss averaged over all (teacher view, student view) pairs
            of different views. Also updates the ``center`` buffer.
        """
        student_out = student_output / self.student_temp
        student_out = student_out.chunk(self.ncrops)

        # teacher centering and sharpening
        temp = self.teacher_temp_schedule[epoch]
        teacher_out = F.softmax((teacher_output - self.center) / temp, dim=-1)
        teacher_out = teacher_out.detach().chunk(2)

        total_loss = 0
        n_loss_terms = 0
        for iq, q in enumerate(teacher_out):
            for v, student_view in enumerate(student_out):
                if v == iq:
                    # we skip cases where student and teacher operate on the same view
                    continue
                loss = torch.sum(-q * F.log_softmax(student_view, dim=-1), dim=-1)
                total_loss += loss.mean()
                n_loss_terms += 1
        total_loss /= n_loss_terms
        self.update_center(teacher_output)
        return total_loss

    @torch.no_grad()
    def update_center(self, teacher_output):
        """
        Update center used for teacher output.

        The batch mean is averaged over all processes when a process group
        is initialised; otherwise the local batch mean is used, so the loss
        also works in a single, non-distributed process.
        """
        batch_center = torch.sum(teacher_output, dim=0, keepdim=True)
        world_size = 1
        if dist.is_available() and dist.is_initialized():
            dist.all_reduce(batch_center)
            world_size = dist.get_world_size()
        batch_center = batch_center / (len(teacher_output) * world_size)

        # ema update
        self.center = self.center * self.center_momentum + batch_center * (1 - self.center_momentum)


class DataAugmentationDINO(object):
    """Multi-crop augmentation: two global views plus several local views."""

    def __init__(self, global_crops_scale, local_crops_scale, local_crops_number,
                 global_size=224, local_size=96):
        """
        Args:
            global_crops_scale: scale range of the two global crops.
            local_crops_scale: scale range of the local crops.
            local_crops_number: number of local crops per image.
            global_size: output edge length of the global crops.
            local_size: output edge length of the local crops.
        """
        flip_and_color_jitter = transforms.Compose([
            transforms.RandomHorizontalFlip(p=0.5),
            transforms.RandomApply(
                [transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.2, hue=0.1)],
                p=0.8
            ),
            transforms.RandomGrayscale(p=0.2),
        ])
        normalize = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
        ])

        # first global crop
        self.global_transfo1 = transforms.Compose([
            transforms.RandomResizedCrop(global_size, scale=global_crops_scale, interpolation=Image.BICUBIC),
            flip_and_color_jitter,
            utils.GaussianBlur(1.0),
            normalize,
        ])
        # second global crop
        self.global_transfo2 = transforms.Compose([
            transforms.RandomResizedCrop(global_size, scale=global_crops_scale, interpolation=Image.BICUBIC),
            flip_and_color_jitter,
            utils.GaussianBlur(0.1),
            utils.Solarization(0.2),
            normalize,
        ])
        # transformation for the local small crops
        self.local_crops_number = local_crops_number
        self.local_transfo = transforms.Compose([
            transforms.RandomResizedCrop(local_size, scale=local_crops_scale, interpolation=Image.BICUBIC),
            flip_and_color_jitter,
            utils.GaussianBlur(p=0.5),
            normalize,
        ])

    def __call__(self, image):
        """Return ``[global_1, global_2, local_1, ..., local_n]`` for ``image``."""
        crops = [self.global_transfo1(image), self.global_transfo2(image)]
        for _ in range(self.local_crops_number):
            crops.append(self.local_transfo(image))
        return crops
def parse_args():
    """Parse the DINO training options plus the submitit/SLURM options."""
    parser = argparse.ArgumentParser("Submitit for DINO", parents=[main_dino.get_args_parser()])
    parser.add_argument("--ngpus", default=8, type=int, help="Number of gpus to request on each node")
    parser.add_argument("--nodes", default=2, type=int, help="Number of nodes to request")
    parser.add_argument("--timeout", default=2800, type=int, help="Duration of the job")

    parser.add_argument("--partition", default="learnfair", type=str, help="Partition where to submit")
    parser.add_argument("--use_volta32", action='store_true', help="Big models? Use this")
    parser.add_argument('--comment', default="", type=str,
                        help='Comment to pass to scheduler, e.g. priority message')
    return parser.parse_args()


def get_shared_folder() -> Path:
    """Return (creating it if needed) ``/checkpoint/<USER>/experiments``.

    Raises:
        RuntimeError: if ``/checkpoint/`` is not a directory.
    """
    user = os.getenv("USER")
    if Path("/checkpoint/").is_dir():
        p = Path(f"/checkpoint/{user}/experiments")
        # parents=True: the per-user directory may not exist on a first run.
        p.mkdir(parents=True, exist_ok=True)
        return p
    raise RuntimeError("No shared folder available")


def get_init_file():
    """Return a fresh, not yet existing init-file path in the shared folder."""
    # Init file must not exist, but its parent dir must exist;
    # get_shared_folder() already creates that directory.
    init_file = get_shared_folder() / f"{uuid.uuid4().hex}_init"
    if init_file.exists():
        os.remove(str(init_file))
    return init_file


class Trainer(object):
    """Picklable callable that submitit runs once per task."""

    def __init__(self, args):
        self.args = args

    def __call__(self):
        """Fill in the per-task distributed settings and start training."""
        import main_dino

        self._setup_gpu_args()
        main_dino.train_dino(self.args)

    def checkpoint(self):
        """Build the resubmission with a fresh rendez-vous init file."""
        import submitit

        self.args.dist_url = get_init_file().as_uri()
        print("Requeuing ", self.args)
        empty_trainer = type(self)(self.args)
        return submitit.helpers.DelayedSubmission(empty_trainer)

    def _setup_gpu_args(self):
        """Copy rank, world size and job id from the submitit job environment."""
        import submitit
        from pathlib import Path

        job_env = submitit.JobEnvironment()
        # "%j" in the output directory is replaced by the job id.
        self.args.output_dir = Path(str(self.args.output_dir).replace("%j", str(job_env.job_id)))
        self.args.gpu = job_env.local_rank
        self.args.rank = job_env.global_rank
        self.args.world_size = job_env.num_tasks
        print(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}")
job = executor.submit(trainer) + + print(f"Submitted job_id: {job.job_id}") + print(f"Logs and checkpoints will be saved at: {args.output_dir}") + + +if __name__ == "__main__": + main() diff --git a/PuzzleTuning/Counterpart PreTrain Methods/dino-main/utils.py b/PuzzleTuning/Counterpart PreTrain Methods/dino-main/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9586250123a125a83ea2679e121b1b0ef8089916 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/dino-main/utils.py @@ -0,0 +1,829 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Misc functions. + +Mostly copy-paste from torchvision references or other public repos like DETR: +https://github.com/facebookresearch/detr/blob/master/util/misc.py +""" +import os +import sys +import time +import math +import random +import datetime +import subprocess +from collections import defaultdict, deque + +import numpy as np +import torch +from torch import nn +import torch.distributed as dist +from PIL import ImageFilter, ImageOps + + +class GaussianBlur(object): + """ + Apply Gaussian Blur to the PIL image. 
class GaussianBlur(object):
    """
    Apply Gaussian Blur to the PIL image.

    With probability ``p`` the image is blurred with a radius drawn uniformly
    from ``[radius_min, radius_max]``; otherwise it is returned unchanged.
    """
    def __init__(self, p=0.5, radius_min=0.1, radius_max=2.):
        self.prob = p
        self.radius_min = radius_min
        self.radius_max = radius_max

    def __call__(self, img):
        # Skip the blur when the random draw falls above the probability.
        if random.random() > self.prob:
            return img

        radius = random.uniform(self.radius_min, self.radius_max)
        return img.filter(ImageFilter.GaussianBlur(radius=radius))


class Solarization(object):
    """
    Apply Solarization to the PIL image with probability ``p``.
    """
    def __init__(self, p):
        self.p = p

    def __call__(self, img):
        if random.random() < self.p:
            return ImageOps.solarize(img)
        return img


# Root of the public DINO checkpoint bucket used by the two loaders below.
_DINO_URL_ROOT = "https://dl.fbaipublicfiles.com/dino/"


def _strip_key_prefix(state_dict, prefix):
    """Return a copy of ``state_dict`` with a leading ``prefix`` removed from keys."""
    cut = len(prefix)
    return {(k[cut:] if k.startswith(prefix) else k): v for k, v in state_dict.items()}


def load_pretrained_weights(model, pretrained_weights, checkpoint_key, model_name, patch_size):
    """
    Load backbone weights into ``model``.

    If ``pretrained_weights`` is an existing file it is loaded (optionally
    taking the sub-dict stored under ``checkpoint_key``) with ``strict=False``.
    Otherwise the reference DINO checkpoint matching ``model_name`` /
    ``patch_size`` is downloaded and loaded strictly; if there is none, the
    model keeps its current (random) weights.
    """
    if os.path.isfile(pretrained_weights):
        # NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
        state_dict = torch.load(pretrained_weights, map_location="cpu")
        if checkpoint_key is not None and checkpoint_key in state_dict:
            print(f"Take key {checkpoint_key} in provided checkpoint dict")
            state_dict = state_dict[checkpoint_key]
        # Remove the `module.` prefix, then the `backbone.` prefix induced by
        # the multicrop wrapper. Only a *leading* prefix is removed: a plain
        # str.replace would also mangle keys such as `submodule.weight`, which
        # strict=False would then silently skip.
        state_dict = _strip_key_prefix(state_dict, "module.")
        state_dict = _strip_key_prefix(state_dict, "backbone.")
        msg = model.load_state_dict(state_dict, strict=False)
        print('Pretrained weights found at {} and loaded with msg: {}'.format(pretrained_weights, msg))
        return

    print("Please use the `--pretrained_weights` argument to indicate the path of the checkpoint to evaluate.")
    # ViT checkpoints are selected by (architecture, patch size) ...
    vit_urls = {
        ("vit_small", 16): "dino_deitsmall16_pretrain/dino_deitsmall16_pretrain.pth",
        ("vit_small", 8): "dino_deitsmall8_pretrain/dino_deitsmall8_pretrain.pth",
        ("vit_base", 16): "dino_vitbase16_pretrain/dino_vitbase16_pretrain.pth",
        ("vit_base", 8): "dino_vitbase8_pretrain/dino_vitbase8_pretrain.pth",
    }
    # ... the remaining architectures by name only.
    named_urls = {
        "xcit_small_12_p16": "dino_xcit_small_12_p16_pretrain/dino_xcit_small_12_p16_pretrain.pth",
        "xcit_small_12_p8": "dino_xcit_small_12_p8_pretrain/dino_xcit_small_12_p8_pretrain.pth",
        "xcit_medium_24_p16": "dino_xcit_medium_24_p16_pretrain/dino_xcit_medium_24_p16_pretrain.pth",
        "xcit_medium_24_p8": "dino_xcit_medium_24_p8_pretrain/dino_xcit_medium_24_p8_pretrain.pth",
        "resnet50": "dino_resnet50_pretrain/dino_resnet50_pretrain.pth",
    }
    url = vit_urls.get((model_name, patch_size))
    if url is None:
        url = named_urls.get(model_name)

    if url is not None:
        print("Since no pretrained weights have been provided, we load the reference pretrained DINO weights.")
        state_dict = torch.hub.load_state_dict_from_url(url=_DINO_URL_ROOT + url)
        model.load_state_dict(state_dict, strict=True)
    else:
        print("There is no reference weights available for this model => We use random weights.")


def load_pretrained_linear_weights(linear_classifier, model_name, patch_size):
    """
    Load the reference linear-probe weights for ``model_name`` / ``patch_size``.

    When no reference checkpoint exists for the combination, the classifier is
    left untouched.
    """
    vit_urls = {
        ("vit_small", 16): "dino_deitsmall16_pretrain/dino_deitsmall16_linearweights.pth",
        ("vit_small", 8): "dino_deitsmall8_pretrain/dino_deitsmall8_linearweights.pth",
        ("vit_base", 16): "dino_vitbase16_pretrain/dino_vitbase16_linearweights.pth",
        ("vit_base", 8): "dino_vitbase8_pretrain/dino_vitbase8_linearweights.pth",
    }
    url = vit_urls.get((model_name, patch_size))
    if url is None and model_name == "resnet50":
        url = "dino_resnet50_pretrain/dino_resnet50_linearweights.pth"

    if url is not None:
        print("We load the reference pretrained linear weights.")
        state_dict = torch.hub.load_state_dict_from_url(url=_DINO_URL_ROOT + url)["state_dict"]
        linear_classifier.load_state_dict(state_dict, strict=True)
    else:
        print("We use random linear weights.")


def clip_gradients(model, clip):
    """
    Clip each parameter's gradient to an L2 norm of at most ``clip``.

    Clipping is done per parameter (not on the global norm). Returns the list
    of gradient norms measured *before* clipping, one per parameter that has a
    gradient.
    """
    norms = []
    for _, param in model.named_parameters():
        if param.grad is None:
            continue
        grad_norm = param.grad.data.norm(2)
        norms.append(grad_norm.item())
        scale = clip / (grad_norm + 1e-6)
        if scale < 1:
            param.grad.data.mul_(scale)
    return norms


def cancel_gradients_last_layer(epoch, model, freeze_last_layer):
    """
    Drop the gradients of every parameter whose name contains ``last_layer``
    while ``epoch < freeze_last_layer``.
    """
    if epoch >= freeze_last_layer:
        return
    for name, param in model.named_parameters():
        if "last_layer" in name:
            param.grad = None


def restart_from_checkpoint(ckp_path, run_variables=None, **kwargs):
    """
    Re-start from checkpoint.

    Args:
        ckp_path: path of the checkpoint file; nothing happens if it is missing.
        run_variables: optional dict updated in place with the checkpoint
            values of the keys it already contains.
        **kwargs: ``checkpoint key -> object`` pairs; each object's
            ``load_state_dict`` receives the matching checkpoint entry,
            e.g. ``state_dict=model``.
    """
    if not os.path.isfile(ckp_path):
        return
    print("Found checkpoint at {}".format(ckp_path))

    # open checkpoint file
    checkpoint = torch.load(ckp_path, map_location="cpu")

    for key, value in kwargs.items():
        if key not in checkpoint or value is None:
            print("=> key '{}' not found in checkpoint: '{}'".format(key, ckp_path))
            continue
        try:
            msg = value.load_state_dict(checkpoint[key], strict=False)
            print("=> loaded '{}' from checkpoint '{}' with msg {}".format(key, ckp_path, msg))
        except TypeError:
            # Objects whose load_state_dict has no `strict` argument.
            try:
                msg = value.load_state_dict(checkpoint[key])
                print("=> loaded '{}' from checkpoint: '{}'".format(key, ckp_path))
            except ValueError:
                print("=> failed to load '{}' from checkpoint: '{}'".format(key, ckp_path))

    # re load variable important for the run
    if run_variables is not None:
        for var_name in run_variables:
            if var_name in checkpoint:
                run_variables[var_name] = checkpoint[var_name]


def cosine_scheduler(base_value, final_value, epochs, niter_per_ep, warmup_epochs=0, start_warmup_value=0):
    """
    Build a per-iteration schedule: linear warmup from ``start_warmup_value``
    to ``base_value`` followed by a cosine decay down to ``final_value``.

    Returns a 1-D numpy array with ``epochs * niter_per_ep`` entries.
    """
    warmup_iters = warmup_epochs * niter_per_ep
    if warmup_epochs > 0:
        warmup_schedule = np.linspace(start_warmup_value, base_value, warmup_iters)
    else:
        warmup_schedule = np.array([])

    iters = np.arange(epochs * niter_per_ep - warmup_iters)
    cosine = 1 + np.cos(np.pi * iters / len(iters))
    schedule = final_value + 0.5 * (base_value - final_value) * cosine

    schedule = np.concatenate((warmup_schedule, schedule))
    assert len(schedule) == epochs * niter_per_ep
    return schedule
def bool_flag(s):
    """
    Parse boolean arguments from the command line.

    Accepts (case-insensitively) ``on/true/1`` and ``off/false/0``.

    Raises:
        argparse.ArgumentTypeError: for any other string.
    """
    # Imported locally: this module's import block does not bring in argparse,
    # so the raise below would otherwise fail with NameError.
    import argparse

    falsy_strings = {"off", "false", "0"}
    truthy_strings = {"on", "true", "1"}
    lowered = s.lower()
    if lowered in falsy_strings:
        return False
    if lowered in truthy_strings:
        return True
    raise argparse.ArgumentTypeError("invalid value for a boolean flag")


def fix_random_seeds(seed=31):
    """
    Fix random seeds of torch (CPU and CUDA), numpy and Python's ``random``.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    # The augmentations in this file draw from the stdlib generator.
    random.seed(seed)


class SmoothedValue(object):
    """Track a series of values and provide access to smoothed values over a
    window or the global series average.
    """

    def __init__(self, window_size=20, fmt=None):
        if fmt is None:
            fmt = "{median:.6f} ({global_avg:.6f})"
        self.deque = deque(maxlen=window_size)  # most recent values only
        self.total = 0.0                        # weighted sum of every value seen
        self.count = 0                          # total weight seen
        self.fmt = fmt

    def update(self, value, n=1):
        """Record ``value`` with weight ``n``."""
        self.deque.append(value)
        self.count += n
        self.total += value * n

    def synchronize_between_processes(self):
        """
        Sum ``count`` and ``total`` over all processes.

        Warning: does not synchronize the deque!
        """
        if not is_dist_avail_and_initialized():
            return
        t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
        dist.barrier()
        dist.all_reduce(t)
        count, total = t.tolist()
        self.count = int(count)
        self.total = total

    @property
    def median(self):
        return torch.tensor(list(self.deque)).median().item()

    @property
    def avg(self):
        return torch.tensor(list(self.deque), dtype=torch.float32).mean().item()

    @property
    def global_avg(self):
        return self.total / self.count

    @property
    def max(self):
        return max(self.deque)

    @property
    def value(self):
        return self.deque[-1]

    def __str__(self):
        return self.fmt.format(
            median=self.median,
            avg=self.avg,
            global_avg=self.global_avg,
            max=self.max,
            value=self.value)


def reduce_dict(input_dict, average=True):
    """
    Args:
        input_dict (dict): all the values will be reduced
        average (bool): whether to do average or sum
    Reduce the values in the dictionary from all processes so that all processes
    have the averaged results. Returns a dict with the same fields as
    input_dict, after reduction. With fewer than two processes the input dict
    is returned as is.
    """
    world_size = get_world_size()
    if world_size < 2:
        return input_dict
    with torch.no_grad():
        # sort the keys so that they are consistent across processes
        names = sorted(input_dict.keys())
        values = torch.stack([input_dict[k] for k in names], dim=0)
        dist.all_reduce(values)
        if average:
            values /= world_size
        reduced_dict = {k: v for k, v in zip(names, values)}
    return reduced_dict


class MetricLogger(object):
    """Collection of named ``SmoothedValue`` meters with periodic printing."""

    def __init__(self, delimiter="\t"):
        self.meters = defaultdict(SmoothedValue)
        self.delimiter = delimiter

    def update(self, **kwargs):
        """Push one scalar (float, int or one-element tensor) per named meter."""
        for k, v in kwargs.items():
            if isinstance(v, torch.Tensor):
                v = v.item()
            assert isinstance(v, (float, int))
            self.meters[k].update(v)

    def __getattr__(self, attr):
        # Only reached when normal lookup fails: expose meters as attributes.
        if attr in self.meters:
            return self.meters[attr]
        if attr in self.__dict__:
            return self.__dict__[attr]
        raise AttributeError("'{}' object has no attribute '{}'".format(
            type(self).__name__, attr))

    def __str__(self):
        return self.delimiter.join(
            "{}: {}".format(name, str(meter)) for name, meter in self.meters.items()
        )

    def synchronize_between_processes(self):
        for meter in self.meters.values():
            meter.synchronize_between_processes()

    def add_meter(self, name, meter):
        self.meters[name] = meter

    def log_every(self, iterable, print_freq, header=None):
        """
        Yield the items of ``iterable`` (which must support ``len``), printing
        progress, ETA, meters and timings every ``print_freq`` items and on the
        last one, then a total-time summary.
        """
        if not header:
            header = ''
        n_total = len(iterable)
        use_cuda = torch.cuda.is_available()
        start_time = time.time()
        end = time.time()
        iter_time = SmoothedValue(fmt='{avg:.6f}')
        data_time = SmoothedValue(fmt='{avg:.6f}')
        space_fmt = ':' + str(len(str(n_total))) + 'd'
        fields = [
            header,
            '[{0' + space_fmt + '}/{1}]',
            'eta: {eta}',
            '{meters}',
            'time: {time}',
            'data: {data}',
        ]
        if use_cuda:
            fields.append('max mem: {memory:.0f}')
        log_msg = self.delimiter.join(fields)
        MB = 1024.0 * 1024.0
        for i, obj in enumerate(iterable):
            data_time.update(time.time() - end)
            yield obj
            iter_time.update(time.time() - end)
            if i % print_freq == 0 or i == n_total - 1:
                eta_seconds = iter_time.global_avg * (n_total - i)
                fmt_kwargs = dict(
                    eta=str(datetime.timedelta(seconds=int(eta_seconds))),
                    meters=str(self),
                    time=str(iter_time),
                    data=str(data_time),
                )
                if use_cuda:
                    fmt_kwargs['memory'] = torch.cuda.max_memory_allocated() / MB
                print(log_msg.format(i, n_total, **fmt_kwargs))
            end = time.time()
        total_time = time.time() - start_time
        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
        # max(..., 1): an empty iterable must not raise ZeroDivisionError here.
        print('{} Total time: {} ({:.6f} s / it)'.format(
            header, total_time_str, total_time / max(n_total, 1)))


def get_sha():
    """
    Describe the git state (sha, clean/dirty, branch) of the directory holding
    this file. Best effort: fields that cannot be determined keep their
    placeholder value.
    """
    cwd = os.path.dirname(os.path.abspath(__file__))

    def _run(command):
        return subprocess.check_output(command, cwd=cwd).decode('ascii').strip()

    sha = 'N/A'
    diff = "clean"
    branch = 'N/A'
    try:
        sha = _run(['git', 'rev-parse', 'HEAD'])
        subprocess.check_output(['git', 'diff'], cwd=cwd)
        diff = _run(['git', 'diff-index', 'HEAD'])
        diff = "has uncommited changes" if diff else "clean"
        branch = _run(['git', 'rev-parse', '--abbrev-ref', 'HEAD'])
    except Exception:
        # Deliberately silent: git may be missing or this may not be a checkout.
        pass
    return f"sha: {sha}, status: {diff}, branch: {branch}"


def is_dist_avail_and_initialized():
    """Return True only when torch.distributed is available and initialized."""
    return dist.is_available() and dist.is_initialized()


def get_world_size():
    """Number of processes in the default group, or 1 when not distributed."""
    if not is_dist_avail_and_initialized():
        return 1
    return dist.get_world_size()


def get_rank():
    """Rank of this process, or 0 when not distributed."""
    if not is_dist_avail_and_initialized():
        return 0
    return dist.get_rank()


def is_main_process():
    return get_rank() == 0


def save_on_master(*args, **kwargs):
    """``torch.save`` executed on the main process only."""
    if is_main_process():
        torch.save(*args, **kwargs)
def setup_for_distributed(is_master):
    """
    This function disables printing when not in master process.

    It replaces ``builtins.print`` process-wide; pass ``force=True`` to the
    patched ``print`` to emit a message from any process.
    """
    import builtins as __builtin__
    builtin_print = __builtin__.print

    def print(*args, **kwargs):
        force = kwargs.pop('force', False)
        if is_master or force:
            builtin_print(*args, **kwargs)

    __builtin__.print = print


def init_distributed_mode(args):
    """
    Fill ``args.rank`` / ``args.gpu`` (and, where known, ``args.world_size``)
    from the launch environment, then initialise the NCCL process group.

    Exits the process with status 1 when no launcher variables are set and no
    GPU is available.
    """
    # launched with torch.distributed.launch
    if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
        args.rank = int(os.environ["RANK"])
        args.world_size = int(os.environ['WORLD_SIZE'])
        args.gpu = int(os.environ['LOCAL_RANK'])
    # launched with submitit on a slurm cluster
    elif 'SLURM_PROCID' in os.environ:
        # NOTE(review): this branch does not set args.world_size; presumably
        # the submitit launcher sets it beforehand - confirm for other callers.
        args.rank = int(os.environ['SLURM_PROCID'])
        args.gpu = args.rank % torch.cuda.device_count()
    # launched naively with `python main_dino.py`
    # we manually add MASTER_ADDR and MASTER_PORT to env variables
    elif torch.cuda.is_available():
        print('Will run the code on one GPU.')
        args.rank, args.gpu, args.world_size = 0, 0, 1
        os.environ['MASTER_ADDR'] = '127.0.0.1'
        os.environ['MASTER_PORT'] = '29500'
    else:
        print('Does not support training without GPU.')
        sys.exit(1)

    dist.init_process_group(
        backend="nccl",
        init_method=args.dist_url,
        world_size=args.world_size,
        rank=args.rank,
    )

    torch.cuda.set_device(args.gpu)
    print('| distributed init (rank {}): {}'.format(
        args.rank, args.dist_url), flush=True)
    dist.barrier()
    setup_for_distributed(args.rank == 0)


def accuracy(output, target, topk=(1,)):
    """Computes the accuracy over the k top predictions for the specified values of k.

    Returns one percentage (0-100, as a tensor) per entry of ``topk``.
    """
    maxk = max(topk)
    batch_size = target.size(0)
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    # correct[r, n] is True when the (r+1)-th best guess for sample n is right.
    correct = pred.eq(target.reshape(1, -1).expand_as(pred))
    return [correct[:k].reshape(-1).float().sum(0) * 100. / batch_size for k in topk]


def _no_grad_trunc_normal_(tensor, mean, std, a, b):
    """Fill ``tensor`` in place with N(mean, std) samples truncated to [a, b]."""
    # Cut & paste from PyTorch official master until it's in a few official releases - RW
    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf

    # Imported locally: this module's import block does not bring in warnings,
    # so the warn() call below would otherwise fail with NameError.
    import warnings

    def norm_cdf(x):
        # Computes standard normal cumulative distribution function
        return (1. + math.erf(x / math.sqrt(2.))) / 2.

    if (mean < a - 2 * std) or (mean > b + 2 * std):
        warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
                      "The distribution of values may be incorrect.",
                      stacklevel=2)

    with torch.no_grad():
        # Values are generated by using a truncated uniform distribution and
        # then using the inverse CDF for the normal distribution.
        # Get upper and lower cdf values
        lower = norm_cdf((a - mean) / std)
        upper = norm_cdf((b - mean) / std)

        # Uniformly fill tensor with values from [lower, upper], then translate
        # to [2*lower-1, 2*upper-1].
        tensor.uniform_(2 * lower - 1, 2 * upper - 1)

        # Use inverse cdf transform for normal distribution to get truncated
        # standard normal
        tensor.erfinv_()

        # Transform to proper mean, std
        tensor.mul_(std * math.sqrt(2.))
        tensor.add_(mean)

        # Clamp to ensure it's in the proper range
        tensor.clamp_(min=a, max=b)
        return tensor


def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
    # type: (Tensor, float, float, float, float) -> Tensor
    """In-place truncated-normal initialisation; returns ``tensor``."""
    return _no_grad_trunc_normal_(tensor, mean, std, a, b)


class LARS(torch.optim.Optimizer):
    """
    Almost copy-paste from https://github.com/facebookresearch/barlowtwins/blob/main/main.py

    Momentum SGD in which the update of every non-1-D parameter gets weight
    decay and is rescaled by ``eta * ||param|| / ||update||``.
    """
    def __init__(self, params, lr=0, weight_decay=0, momentum=0.9, eta=0.001,
                 weight_decay_filter=None, lars_adaptation_filter=None):
        defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum,
                        eta=eta, weight_decay_filter=weight_decay_filter,
                        lars_adaptation_filter=lars_adaptation_filter)
        super().__init__(params, defaults)

    @torch.no_grad()
    def step(self):
        for group in self.param_groups:
            for p in group['params']:
                dp = p.grad
                if dp is None:
                    continue

                # 1-D parameters (biases, norm scales) skip both weight decay
                # and the LARS rescaling.
                if p.ndim != 1:
                    dp = dp.add(p, alpha=group['weight_decay'])
                    param_norm = torch.norm(p)
                    update_norm = torch.norm(dp)
                    one = torch.ones_like(param_norm)
                    # Trust ratio, falling back to 1 when either norm is zero.
                    q = torch.where(param_norm > 0.,
                                    torch.where(update_norm > 0,
                                                (group['eta'] * param_norm / update_norm), one), one)
                    dp = dp.mul(q)

                param_state = self.state[p]
                if 'mu' not in param_state:
                    param_state['mu'] = torch.zeros_like(p)
                mu = param_state['mu']
                mu.mul_(group['momentum']).add_(dp)

                p.add_(mu, alpha=-group['lr'])
class MultiCropWrapper(nn.Module):
    """
    Perform forward pass separately on each resolution input.
    The inputs corresponding to a single resolution are clubbed and single
    forward is run on the same resolution inputs. Hence we do several
    forward passes = number of different resolutions used. We then
    concatenate all the output features and run the head forward on these
    concatenated features.
    """
    def __init__(self, backbone, head):
        super(MultiCropWrapper, self).__init__()
        # disable layers dedicated to ImageNet labels classification
        backbone.fc, backbone.head = nn.Identity(), nn.Identity()
        self.backbone = backbone
        self.head = head

    def forward(self, x):
        # convert to list
        if not isinstance(x, list):
            x = [x]
        # Group consecutive crops by their last dimension and turn the group
        # sizes into end offsets within the crop list.
        resolutions = torch.tensor([inp.shape[-1] for inp in x])
        _, counts = torch.unique_consecutive(resolutions, return_counts=True)
        idx_crops = torch.cumsum(counts, 0)

        start_idx = 0
        output = torch.empty(0).to(x[0].device)
        for end_idx in idx_crops:
            _out = self.backbone(torch.cat(x[start_idx: end_idx]))
            # The output is a tuple with XCiT model. See:
            # https://github.com/facebookresearch/xcit/blob/master/xcit.py#L404-L405
            if isinstance(_out, tuple):
                _out = _out[0]
            # accumulate outputs
            output = torch.cat((output, _out))
            start_idx = end_idx
        # Run the head forward on the concatenated features.
        return self.head(output)


def get_params_groups(model):
    """
    Split the trainable parameters of ``model`` into two optimizer groups:
    regularized ones, and biases / 1-D parameters with ``weight_decay`` 0.
    """
    regularized = []
    not_regularized = []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        # we do not regularize biases nor Norm parameters
        is_bias_or_norm = name.endswith(".bias") or len(param.shape) == 1
        (not_regularized if is_bias_or_norm else regularized).append(param)
    return [{'params': regularized}, {'params': not_regularized, 'weight_decay': 0.}]


def has_batchnorms(model):
    """Return True when ``model`` contains any BatchNorm / SyncBatchNorm module."""
    bn_types = (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.SyncBatchNorm)
    return any(isinstance(module, bn_types) for _, module in model.named_modules())


class PCA():
    """
    Class to compute and apply PCA.

    ``dim`` is the number of components kept, ``whit`` the whitening exponent
    applied to the eigenvalues. ``mean`` is never set by this class; when a
    caller assigns it, ``apply`` subtracts it from the input first.
    """
    def __init__(self, dim=256, whit=0.5):
        self.dim = dim
        self.whit = whit
        self.mean = None

    def train_pca(self, cov):
        """
        Takes a covariance matrix (np.ndarray) as input and stores the
        whitened projection matrix in ``self.dvt``.
        """
        d, v = np.linalg.eigh(cov)
        # Floor tiny / negative eigenvalues to keep the whitening finite.
        eps = d.max() * 1e-5
        n_0 = (d < eps).sum()
        if n_0 > 0:
            d[d < eps] = eps

        # total energy
        totenergy = d.sum()

        # sort eigenvectors with eigenvalues order
        idx = np.argsort(d)[::-1][:self.dim]
        d = d[idx]
        v = v[:, idx]

        print("keeping %.2f %% of the energy" % (d.sum() / totenergy * 100.0))

        # for the whitening
        d = np.diag(1. / d**self.whit)

        # principal components
        self.dvt = np.dot(d, v.T)

    def apply(self, x):
        """
        Project the rows of ``x`` (numpy array or torch tensor, one sample per
        row) and return the result. ``x`` itself is left unmodified.
        """
        # input is from numpy
        if isinstance(x, np.ndarray):
            if self.mean is not None:
                # Out-of-place: do not overwrite the caller's features.
                x = x - self.mean
            return np.dot(self.dvt, x.T).T

        # input is from torch: build the operands on x's own device (CPU or GPU)
        if self.mean is not None:
            x = x - torch.as_tensor(self.mean, dtype=torch.float32, device=x.device)
        dvt = torch.as_tensor(self.dvt, dtype=torch.float32, device=x.device)
        return torch.mm(dvt, x.transpose(0, 1)).transpose(0, 1)
def compute_ap(ranks, nres):
    """
    Computes average precision for given ranked indexes.
    Arguments
    ---------
    ranks : zero-based ranks of positive images
    nres  : number of positive images
    Returns
    -------
    ap    : average precision
    """
    # accumulate trapezoids in PR-plot
    ap = 0
    recall_step = 1. / nres

    for j, rank in enumerate(ranks):
        # Precision just before and just after the j-th positive is retrieved.
        precision_0 = 1. if rank == 0 else float(j) / rank
        precision_1 = float(j + 1) / (rank + 1)
        ap += (precision_0 + precision_1) * recall_step / 2.

    return ap


def compute_map(ranks, gnd, kappas=()):
    """
    Computes the mAP for a given set of returned results.
         Usage:
           map = compute_map (ranks, gnd)
                 computes mean average precision (map) only
           map, aps, pr, prs = compute_map (ranks, gnd, kappas)
                 computes mean average precision (map), average precision (aps) for each query
                 computes mean precision at kappas (pr), precision at kappas (prs) for each query
         Notes:
         1) ranks starts from 0, ranks.shape = db_size X #queries
         2) The junk results (e.g., the query itself) should be declared in the gnd struct array
         3) If there are no positive images for some query, that query is excluded from the evaluation
         4) If no query has positive images, map and pr are returned as NaN
    """
    mean_ap = 0.
    nq = len(gnd)  # number of queries
    aps = np.zeros(nq)
    pr = np.zeros(len(kappas))
    prs = np.zeros((nq, len(kappas)))
    nempty = 0

    for i in range(nq):
        qgnd = np.array(gnd[i]['ok'])

        # no positive images, skip from the average
        if qgnd.shape[0] == 0:
            aps[i] = float('nan')
            prs[i, :] = float('nan')
            nempty += 1
            continue

        # The junk list is optional; a missing entry means "no junk".
        try:
            qgndj = np.array(gnd[i]['junk'])
        except (KeyError, IndexError, ValueError):
            qgndj = np.empty(0)

        # sorted positions of positive and junk images (0 based)
        positions = np.arange(ranks.shape[0])
        pos = positions[np.isin(ranks[:, i], qgnd)]
        junk = positions[np.isin(ranks[:, i], qgndj)]

        if len(junk):
            # decrease positions of positives based on the number of
            # junk images appearing before them
            k = 0
            ij = 0
            for ip in range(len(pos)):
                while ij < len(junk) and pos[ip] > junk[ij]:
                    k += 1
                    ij += 1
                pos[ip] = pos[ip] - k

        # compute ap
        ap = compute_ap(pos, len(qgnd))
        mean_ap = mean_ap + ap
        aps[i] = ap

        # compute precision @ k
        pos += 1  # get it to 1-based
        for j, kappa in enumerate(kappas):
            kq = min(pos.max(), kappa)
            prs[i, j] = (pos <= kq).sum() / kq
        pr = pr + prs[i, :]

    n_valid = nq - nempty
    if n_valid == 0:
        # Nothing to average over: report NaN instead of dividing by zero.
        return float('nan'), aps, np.full(len(kappas), float('nan')), prs

    mean_ap = mean_ap / n_valid
    pr = pr / n_valid

    return mean_ap, aps, pr, prs


def multi_scale(samples, model):
    """
    Average the features ``model`` produces for ``samples`` at three scales
    (1, 1/sqrt(2), 1/2) and divide the result by its overall norm.
    """
    scales = (1, 1/2**(1/2), 1/2)  # we use 3 different scales
    v = None
    for s in scales:
        if s == 1:
            inp = samples.clone()
        else:
            inp = nn.functional.interpolate(samples, scale_factor=s, mode='bilinear', align_corners=False)
        feats = model(inp).clone()
        if v is None:
            v = feats
        else:
            v += feats
    v /= len(scales)
    v /= v.norm()
    return v
Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import glob +import sys +import argparse +import cv2 + +from tqdm import tqdm +import matplotlib.pyplot as plt +import torch +import torch.nn as nn +import torchvision +from torchvision import transforms as pth_transforms +import numpy as np +from PIL import Image + +import utils +import vision_transformer as vits + + +FOURCC = { + "mp4": cv2.VideoWriter_fourcc(*"MP4V"), + "avi": cv2.VideoWriter_fourcc(*"XVID"), +} +DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + + +class VideoGenerator: + def __init__(self, args): + self.args = args + # self.model = None + # Don't need to load model if you only want a video + if not self.args.video_only: + self.model = self.__load_model() + + def run(self): + if self.args.input_path is None: + print(f"Provided input path {self.args.input_path} is non valid.") + sys.exit(1) + else: + if self.args.video_only: + self._generate_video_from_images( + self.args.input_path, self.args.output_path + ) + else: + # If input path exists + if os.path.exists(self.args.input_path): + # If input is a video file + if os.path.isfile(self.args.input_path): + frames_folder = os.path.join(self.args.output_path, "frames") + attention_folder = os.path.join( + self.args.output_path, "attention" + ) + + os.makedirs(frames_folder, exist_ok=True) + os.makedirs(attention_folder, exist_ok=True) + + self._extract_frames_from_video( + 
self.args.input_path, frames_folder + ) + + self._inference( + frames_folder, + attention_folder, + ) + + self._generate_video_from_images( + attention_folder, self.args.output_path + ) + + # If input is a folder of already extracted frames + if os.path.isdir(self.args.input_path): + attention_folder = os.path.join( + self.args.output_path, "attention" + ) + + os.makedirs(attention_folder, exist_ok=True) + + self._inference(self.args.input_path, attention_folder) + + self._generate_video_from_images( + attention_folder, self.args.output_path + ) + + # If input path doesn't exists + else: + print(f"Provided input path {self.args.input_path} doesn't exists.") + sys.exit(1) + + def _extract_frames_from_video(self, inp: str, out: str): + vidcap = cv2.VideoCapture(inp) + self.args.fps = vidcap.get(cv2.CAP_PROP_FPS) + + print(f"Video: {inp} ({self.args.fps} fps)") + print(f"Extracting frames to {out}") + + success, image = vidcap.read() + count = 0 + while success: + cv2.imwrite( + os.path.join(out, f"frame-{count:04}.jpg"), + image, + ) + success, image = vidcap.read() + count += 1 + + def _generate_video_from_images(self, inp: str, out: str): + img_array = [] + attention_images_list = sorted(glob.glob(os.path.join(inp, "attn-*.jpg"))) + + # Get size of the first image + with open(attention_images_list[0], "rb") as f: + img = Image.open(f) + img = img.convert("RGB") + size = (img.width, img.height) + img_array.append(cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)) + + print(f"Generating video {size} to {out}") + + for filename in tqdm(attention_images_list[1:]): + with open(filename, "rb") as f: + img = Image.open(f) + img = img.convert("RGB") + img_array.append(cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)) + + out = cv2.VideoWriter( + os.path.join(out, "video." 
def _inference(self, inp: str, out: str):
    """Save a head-averaged attention heatmap for every ``*.jpg`` image in ``inp``.

    Each image is converted to RGB, turned into a normalised tensor (resized
    first when ``self.args.resize`` is set), cropped so that both spatial
    sides are multiples of ``self.args.patch_size`` and passed to
    ``self.model.get_last_selfattention``. The attention that token 0 pays to
    all remaining tokens is laid out as a patch grid, upsampled by the patch
    size with nearest-neighbour interpolation, averaged over the heads and
    written with the "inferno" colour map.

    Args:
        inp: Directory that is scanned (non-recursively) for ``*.jpg`` files.
        out: Directory that receives ``attn-<original file name>`` images.
            It is not created here and must already exist.
    """
    print(f"Generating attention images to {out}")

    # The preprocessing pipeline depends only on the command-line arguments,
    # so it is built once instead of once per frame.
    steps = [pth_transforms.ToTensor()]
    if self.args.resize is not None:
        steps.append(pth_transforms.Resize(self.args.resize))
    steps.append(
        pth_transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    )
    transform = pth_transforms.Compose(steps)
    patch_size = self.args.patch_size

    for img_path in tqdm(sorted(glob.glob(os.path.join(inp, "*.jpg")))):
        with open(img_path, "rb") as f:
            # convert() reads the pixel data while the file is still open.
            img = Image.open(f).convert("RGB")

        img = transform(img)

        # make the image divisible by the patch size
        w = img.shape[1] - img.shape[1] % patch_size
        h = img.shape[2] - img.shape[2] % patch_size
        img = img[:, :w, :h].unsqueeze(0)

        w_featmap = img.shape[-2] // patch_size
        h_featmap = img.shape[-1] // patch_size

        attentions = self.model.get_last_selfattention(img.to(DEVICE))
        nh = attentions.shape[1]  # number of heads

        # Keep, for the first batch item, what token 0 attends to among the
        # remaining tokens, arranged as one patch grid per head.
        attentions = attentions[0, :, 0, 1:].reshape(nh, w_featmap, h_featmap)
        attentions = (
            nn.functional.interpolate(
                attentions.unsqueeze(0),
                scale_factor=patch_size,
                mode="nearest",
            )[0]
            .cpu()
            .numpy()
        )

        # NOTE: a cumulative-mass mask driven by --threshold used to be
        # computed here for every frame but was never saved or read, so that
        # dead work has been removed; only the heatmap below is produced.
        fname = os.path.join(out, "attn-" + os.path.basename(img_path))
        plt.imsave(
            fname=fname,
            arr=attentions.mean(axis=0),  # average over heads
            cmap="inferno",
            format="jpg",
        )
def parse_args():
    """Build the command-line parser of the video generator and parse ``sys.argv``.

    Returns:
        The ``argparse.Namespace`` produced by ``parse_args()``. Only
        ``--input_path`` is required; every other option has a default.
    """
    cli = argparse.ArgumentParser("Generation self-attention video")

    # One (flag, add_argument keyword arguments) pair per option, registered
    # in exactly this order so the generated usage text keeps its layout.
    option_table = [
        (
            "--arch",
            {
                "default": "vit_small",
                "type": str,
                "choices": ["vit_tiny", "vit_small", "vit_base"],
                "help": "Architecture (support only ViT atm).",
            },
        ),
        (
            "--patch_size",
            {
                "default": 8,
                "type": int,
                "help": "Patch resolution of the self.model.",
            },
        ),
        (
            "--pretrained_weights",
            {
                "default": "",
                "type": str,
                "help": "Path to pretrained weights to load.",
            },
        ),
        (
            "--checkpoint_key",
            {
                "default": "teacher",
                "type": str,
                "help": 'Key to use in the checkpoint (example: "teacher")',
            },
        ),
        (
            "--input_path",
            {
                "required": True,
                "type": str,
                "help": """Path to a video file if you want to extract frames
            or to a folder of images already extracted by yourself.
            or to a folder of attention images.""",
            },
        ),
        (
            "--output_path",
            {
                "default": "./",
                "type": str,
                "help": """Path to store a folder of frames and / or a folder of attention images.
            and / or a final video. Default to current directory.""",
            },
        ),
        (
            "--threshold",
            {
                "type": float,
                "default": 0.6,
                "help": """We visualize masks
            obtained by thresholding the self-attention maps to keep xx percent of the mass.""",
            },
        ),
        (
            "--resize",
            {
                "default": None,
                "type": int,
                "nargs": "+",
                "help": """Apply a resize transformation to input image(s). Use if OOM error.
            Usage (single or W H): --resize 512, --resize 720 1280""",
            },
        ),
        (
            "--video_only",
            {
                "action": "store_true",
                "help": """Use this flag if you only want to generate a video and not all attention images.
            If used, --input_path must be set to the folder of attention images. Ex: ./attention/""",
            },
        ),
        (
            "--fps",
            {
                "default": 30.0,
                "type": float,
                "help": "FPS of input / output video. Automatically set if you extract frames from a video.",
            },
        ),
        (
            "--video_format",
            {
                "default": "mp4",
                "type": str,
                "choices": ["mp4", "avi"],
                "help": "Format of generated video (mp4 or avi).",
            },
        ),
    ]
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    return cli.parse_args()
def drop_path(x, drop_prob: float = 0., training: bool = False):
    """Apply stochastic depth: randomly zero whole samples of ``x``.

    One keep/drop decision is drawn per entry along dimension 0 and broadcast
    over all remaining dimensions. Kept samples are divided by the keep
    probability ``1 - drop_prob``.

    Args:
        x: Input tensor; dimension 0 indexes the samples.
        drop_prob: Probability of dropping a sample.
        training: When False the input is returned untouched.

    Returns:
        ``x`` itself when ``drop_prob == 0`` or ``training`` is False,
        otherwise a new tensor with the same shape as ``x``.
    """
    if drop_prob == 0. or not training:
        return x
    keep_prob = 1 - drop_prob
    if keep_prob <= 0.:
        # Every sample is dropped. Dividing by a zero keep probability would
        # yield inf * 0 = NaN, so return the all-zero result directly.
        return torch.zeros_like(x)
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
    random_tensor.floor_()  # binarize: 1 with probability keep_prob, else 0
    output = x.div(keep_prob) * random_tensor
    return output
class Attention(nn.Module):
    """Multi-head self-attention that also returns its attention weights.

    Queries, keys and values come from a single linear projection. The
    forward pass returns both the projected output and the attention
    probabilities (after dropout).
    """

    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        per_head_width = dim // num_heads
        # Any falsy qk_scale (None or 0) selects the 1/sqrt(head width) default.
        self.scale = qk_scale or per_head_width ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        """Return ``(output, attention)`` for a 3-D input ``x`` of shape (B, N, C)."""
        batch, tokens, channels = x.shape
        heads = self.num_heads

        # (B, N, 3*C) -> (3, B, heads, N, C // heads), then split q / k / v.
        packed = self.qkv(x).reshape(batch, tokens, 3, heads, channels // heads)
        q, k, v = packed.permute(2, 0, 3, 1, 4).unbind(0)

        scores = q.matmul(k.transpose(-2, -1)) * self.scale
        attn = self.attn_drop(scores.softmax(dim=-1))

        # Merge the heads back into the channel dimension.
        mixed = attn.matmul(v).transpose(1, 2).reshape(batch, tokens, channels)
        return self.proj_drop(self.proj(mixed)), attn
class PatchEmbed(nn.Module):
    """Image to Patch Embedding.

    A strided convolution (kernel size == stride == ``patch_size``) projects
    every non-overlapping patch to an ``embed_dim`` vector.
    """

    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
        super().__init__()
        patches_per_side = img_size // patch_size
        self.img_size = img_size
        self.patch_size = patch_size
        # Patch count for a square image with side ``img_size``.
        self.num_patches = patches_per_side * patches_per_side

        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, x):
        """Map a 4-D batch to a token sequence of shape (B, patches, embed_dim)."""
        # The unpacking only enforces a 4-D input; the sizes are not used.
        _batch, _chans, _height, _width = x.shape
        projected = self.proj(x)
        # Flatten the spatial grid and move the embedding dimension last.
        return projected.flatten(2).transpose(1, 2)
i in range(depth)]) + self.norm = norm_layer(embed_dim) + + # Classifier head + self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + trunc_normal_(self.pos_embed, std=.02) + trunc_normal_(self.cls_token, std=.02) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def interpolate_pos_encoding(self, x, w, h): + npatch = x.shape[1] - 1 + N = self.pos_embed.shape[1] - 1 + if npatch == N and w == h: + return self.pos_embed + class_pos_embed = self.pos_embed[:, 0] + patch_pos_embed = self.pos_embed[:, 1:] + dim = x.shape[-1] + w0 = w // self.patch_embed.patch_size + h0 = h // self.patch_embed.patch_size + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + w0, h0 = w0 + 0.1, h0 + 0.1 + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), dim).permute(0, 3, 1, 2), + scale_factor=(w0 / math.sqrt(N), h0 / math.sqrt(N)), + mode='bicubic', + ) + assert int(w0) == patch_pos_embed.shape[-2] and int(h0) == patch_pos_embed.shape[-1] + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + def prepare_tokens(self, x): + B, nc, w, h = x.shape + x = self.patch_embed(x) # patch linear embedding + + # add the [CLS] token to the embed patch tokens + cls_tokens = self.cls_token.expand(B, -1, -1) + x = torch.cat((cls_tokens, x), dim=1) + + # add positional encoding to each token + x = x + self.interpolate_pos_encoding(x, w, h) + + return self.pos_drop(x) + + def forward(self, x): + x = self.prepare_tokens(x) + for blk in self.blocks: + x = 
class DINOHead(nn.Module):
    """Projection head: an MLP, L2 normalisation, then a weight-normalised linear layer.

    ``nlayers`` (clamped to at least 1) counts the linear layers of the MLP,
    which ends in ``bottleneck_dim`` features. The final bias-free layer maps
    the normalised bottleneck features to ``out_dim``.
    """

    def __init__(self, in_dim, out_dim, use_bn=False, norm_last_layer=True, nlayers=3, hidden_dim=2048, bottleneck_dim=256):
        super().__init__()
        depth = max(nlayers, 1)
        if depth == 1:
            self.mlp = nn.Linear(in_dim, bottleneck_dim)
        else:
            # depth - 1 hidden stages: the first reads in_dim, the rest hidden_dim.
            stage_inputs = [in_dim] + [hidden_dim] * (depth - 2)
            stack = []
            for width_in in stage_inputs:
                stack.append(nn.Linear(width_in, hidden_dim))
                if use_bn:
                    stack.append(nn.BatchNorm1d(hidden_dim))
                stack.append(nn.GELU())
            stack.append(nn.Linear(hidden_dim, bottleneck_dim))
            self.mlp = nn.Sequential(*stack)

        # Runs before last_layer exists, so only the MLP is re-initialised.
        self.apply(self._init_weights)

        prototypes = nn.Linear(bottleneck_dim, out_dim, bias=False)
        self.last_layer = nn.utils.weight_norm(prototypes)
        self.last_layer.weight_g.data.fill_(1)
        if norm_last_layer:
            # Freeze the magnitude so the last layer stays weight-normalised.
            self.last_layer.weight_g.requires_grad = False

    def _init_weights(self, m):
        """Truncated-normal weights and zero bias for every ``nn.Linear``."""
        if not isinstance(m, nn.Linear):
            return
        trunc_normal_(m.weight, std=.02)
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)

    def forward(self, x):
        """Project ``x`` through the MLP, L2-normalise it and apply the last layer."""
        hidden = self.mlp(x)
        unit = nn.functional.normalize(hidden, dim=-1, p=2)
        return self.last_layer(unit)
def apply_mask(image, mask, color, alpha=0.5):
    """Blend ``color`` into ``image`` where ``mask`` is set, in place.

    For each of the first three channels the pixel becomes
    ``pixel * (1 - alpha * mask) + alpha * mask * color[channel] * 255``.

    Args:
        image: Array indexed as ``image[:, :, channel]``; it is modified in place.
        mask: Array broadcastable against one channel of ``image``.
        color: Sequence with at least three components, each scaled by 255.
        alpha: Blend strength applied where the mask is non-zero.

    Returns:
        The same ``image`` object that was passed in.
    """
    weight = alpha * mask
    for channel in (0, 1, 2):
        plane = image[:, :, channel]
        image[:, :, channel] = plane * (1 - weight) + weight * color[channel] * 255
    return image
if __name__ == '__main__':
    # Script entry point: load a ViT, run one image through it, and save the
    # input image, one attention heatmap per head and (with --threshold) one
    # masked overlay per head into --output_dir.
    parser = argparse.ArgumentParser('Visualize Self-Attention maps')
    parser.add_argument('--arch', default='vit_small', type=str,
        choices=['vit_tiny', 'vit_small', 'vit_base'], help='Architecture (support only ViT atm).')
    parser.add_argument('--patch_size', default=8, type=int, help='Patch resolution of the model.')
    parser.add_argument('--pretrained_weights', default='', type=str,
        help="Path to pretrained weights to load.")
    parser.add_argument("--checkpoint_key", default="teacher", type=str,
        help='Key to use in the checkpoint (example: "teacher")')
    parser.add_argument("--image_path", default=None, type=str, help="Path of the image to load.")
    parser.add_argument("--image_size", default=(480, 480), type=int, nargs="+", help="Resize image.")
    parser.add_argument('--output_dir', default='.', help='Path where to save visualizations.')
    # argparse %-formats help strings, so a literal percent sign has to be
    # written as "%%"; a bare "%" makes `--help` fail instead of printing.
    parser.add_argument("--threshold", type=float, default=None, help="""We visualize masks
        obtained by thresholding the self-attention maps to keep xx%% of the mass.""")
    args = parser.parse_args()

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    # build model (inference only: parameters frozen, eval mode)
    model = vits.__dict__[args.arch](patch_size=args.patch_size, num_classes=0)
    for p in model.parameters():
        p.requires_grad = False
    model.eval()
    model.to(device)
    if os.path.isfile(args.pretrained_weights):
        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # from a trusted source.
        state_dict = torch.load(args.pretrained_weights, map_location="cpu")
        if args.checkpoint_key is not None and args.checkpoint_key in state_dict:
            print(f"Take key {args.checkpoint_key} in provided checkpoint dict")
            state_dict = state_dict[args.checkpoint_key]
        # remove `module.` prefix
        state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()}
        # remove `backbone.` prefix induced by multicrop wrapper
        state_dict = {k.replace("backbone.", ""): v for k, v in state_dict.items()}
        # strict=False: keys that do not match are reported in `msg`, not raised.
        msg = model.load_state_dict(state_dict, strict=False)
        print('Pretrained weights found at {} and loaded with msg: {}'.format(args.pretrained_weights, msg))
    else:
        print("Please use the `--pretrained_weights` argument to indicate the path of the checkpoint to evaluate.")
        # Fall back to the published reference weights when a matching
        # (architecture, patch size) pair exists.
        url = None
        if args.arch == "vit_small" and args.patch_size == 16:
            url = "dino_deitsmall16_pretrain/dino_deitsmall16_pretrain.pth"
        elif args.arch == "vit_small" and args.patch_size == 8:
            url = "dino_deitsmall8_300ep_pretrain/dino_deitsmall8_300ep_pretrain.pth"  # model used for visualizations in our paper
        elif args.arch == "vit_base" and args.patch_size == 16:
            url = "dino_vitbase16_pretrain/dino_vitbase16_pretrain.pth"
        elif args.arch == "vit_base" and args.patch_size == 8:
            url = "dino_vitbase8_pretrain/dino_vitbase8_pretrain.pth"
        if url is not None:
            print("Since no pretrained weights have been provided, we load the reference pretrained DINO weights.")
            state_dict = torch.hub.load_state_dict_from_url(url="https://dl.fbaipublicfiles.com/dino/" + url)
            model.load_state_dict(state_dict, strict=True)
        else:
            print("There is no reference weights available for this model => We use random weights.")

    # open image
    if args.image_path is None:
        # user has not specified any image - we use our own image
        print("Please use the `--image_path` argument to indicate the path of the image you wish to visualize.")
        print("Since no image path have been provided, we take the first image in our paper.")
        response = requests.get("https://dl.fbaipublicfiles.com/dino/img.png")
        # Surface an HTTP error directly instead of failing later while
        # decoding an error page as an image.
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))
        img = img.convert('RGB')
    elif os.path.isfile(args.image_path):
        with open(args.image_path, 'rb') as f:
            img = Image.open(f)
            img = img.convert('RGB')
    else:
        print(f"Provided image path {args.image_path} is non valid.")
        sys.exit(1)
    transform = pth_transforms.Compose([
        pth_transforms.Resize(args.image_size),
        pth_transforms.ToTensor(),
        pth_transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])
    img = transform(img)

    # make the image divisible by the patch size
    w, h = img.shape[1] - img.shape[1] % args.patch_size, img.shape[2] - img.shape[2] % args.patch_size
    img = img[:, :w, :h].unsqueeze(0)

    w_featmap = img.shape[-2] // args.patch_size
    h_featmap = img.shape[-1] // args.patch_size

    attentions = model.get_last_selfattention(img.to(device))

    nh = attentions.shape[1]  # number of head

    # we keep only the output patch attention
    attentions = attentions[0, :, 0, 1:].reshape(nh, -1)

    if args.threshold is not None:
        # we keep only a certain percentage of the mass
        val, idx = torch.sort(attentions)
        val /= torch.sum(val, dim=1, keepdim=True)
        cumval = torch.cumsum(val, dim=1)
        th_attn = cumval > (1 - args.threshold)
        # argsort of the sort permutation restores the original patch order
        idx2 = torch.argsort(idx)
        for head in range(nh):
            th_attn[head] = th_attn[head][idx2[head]]
        th_attn = th_attn.reshape(nh, w_featmap, h_featmap).float()
        # interpolate
        th_attn = nn.functional.interpolate(th_attn.unsqueeze(0), scale_factor=args.patch_size, mode="nearest")[0].cpu().numpy()

    attentions = attentions.reshape(nh, w_featmap, h_featmap)
    attentions = nn.functional.interpolate(attentions.unsqueeze(0), scale_factor=args.patch_size, mode="nearest")[0].cpu().numpy()

    # save attentions heatmaps
    os.makedirs(args.output_dir, exist_ok=True)
    torchvision.utils.save_image(torchvision.utils.make_grid(img, normalize=True, scale_each=True), os.path.join(args.output_dir, "img.png"))
    for j in range(nh):
        fname = os.path.join(args.output_dir, "attn-head" + str(j) + ".png")
        plt.imsave(fname=fname, arr=attentions[j], format='png')
        print(f"{fname} saved.")

    if args.threshold is not None:
        # The overlays are drawn on the image that was just written to disk.
        image = skimage.io.imread(os.path.join(args.output_dir, "img.png"))
        for j in range(nh):
            display_instances(image, th_attn[j], fname=os.path.join(args.output_dir, "mask_th" + str(args.threshold) + "_head" + str(j) +".png"), blur=False)
Distribution of +Creative Commons public licenses does not create a lawyer-client or +other relationship. Creative Commons makes its licenses and related +information available on an "as-is" basis. Creative Commons gives no +warranties regarding its licenses, any material licensed under their +terms and conditions, or any related information. Creative Commons +disclaims all liability for damages resulting from their use to the +fullest extent possible. + +Using Creative Commons Public Licenses + +Creative Commons public licenses provide a standard set of terms and +conditions that creators and other rights holders may use to share +original works of authorship and other material subject to copyright +and certain other rights specified in the public license below. The +following considerations are for informational purposes only, are not +exhaustive, and do not form part of our licenses. + + Considerations for licensors: Our public licenses are + intended for use by those authorized to give the public + permission to use material in ways otherwise restricted by + copyright and certain other rights. Our licenses are + irrevocable. Licensors should read and understand the terms + and conditions of the license they choose before applying it. + Licensors should also secure all rights necessary before + applying our licenses so that the public can reuse the + material as expected. Licensors should clearly mark any + material not subject to the license. This includes other CC- + licensed material, or material used under an exception or + limitation to copyright. More considerations for licensors: + wiki.creativecommons.org/Considerations_for_licensors + + Considerations for the public: By using one of our public + licenses, a licensor grants the public permission to use the + licensed material under specified terms and conditions. 
If + the licensor's permission is not necessary for any reason--for + example, because of any applicable exception or limitation to + copyright--then that use is not regulated by the license. Our + licenses grant only permissions under copyright and certain + other rights that a licensor has authority to grant. Use of + the licensed material may still be restricted for other + reasons, including because others have copyright or other + rights in the material. A licensor may make special requests, + such as asking that all changes be marked or described. + Although not required by our licenses, you are encouraged to + respect those requests where reasonable. More_considerations + for the public: + wiki.creativecommons.org/Considerations_for_licensees + +======================================================================= + +Creative Commons Attribution-NonCommercial 4.0 International Public +License + +By exercising the Licensed Rights (defined below), You accept and agree +to be bound by the terms and conditions of this Creative Commons +Attribution-NonCommercial 4.0 International Public License ("Public +License"). To the extent this Public License may be interpreted as a +contract, You are granted the Licensed Rights in consideration of Your +acceptance of these terms and conditions, and the Licensor grants You +such rights in consideration of benefits the Licensor receives from +making the Licensed Material available under these terms and +conditions. + +Section 1 -- Definitions. + + a. Adapted Material means material subject to Copyright and Similar + Rights that is derived from or based upon the Licensed Material + and in which the Licensed Material is translated, altered, + arranged, transformed, or otherwise modified in a manner requiring + permission under the Copyright and Similar Rights held by the + Licensor. 
For purposes of this Public License, where the Licensed + Material is a musical work, performance, or sound recording, + Adapted Material is always produced where the Licensed Material is + synched in timed relation with a moving image. + + b. Adapter's License means the license You apply to Your Copyright + and Similar Rights in Your contributions to Adapted Material in + accordance with the terms and conditions of this Public License. + + c. Copyright and Similar Rights means copyright and/or similar rights + closely related to copyright including, without limitation, + performance, broadcast, sound recording, and Sui Generis Database + Rights, without regard to how the rights are labeled or + categorized. For purposes of this Public License, the rights + specified in Section 2(b)(1)-(2) are not Copyright and Similar + Rights. + d. Effective Technological Measures means those measures that, in the + absence of proper authority, may not be circumvented under laws + fulfilling obligations under Article 11 of the WIPO Copyright + Treaty adopted on December 20, 1996, and/or similar international + agreements. + + e. Exceptions and Limitations means fair use, fair dealing, and/or + any other exception or limitation to Copyright and Similar Rights + that applies to Your use of the Licensed Material. + + f. Licensed Material means the artistic or literary work, database, + or other material to which the Licensor applied this Public + License. + + g. Licensed Rights means the rights granted to You subject to the + terms and conditions of this Public License, which are limited to + all Copyright and Similar Rights that apply to Your use of the + Licensed Material and that the Licensor has authority to license. + + h. Licensor means the individual(s) or entity(ies) granting rights + under this Public License. + + i. NonCommercial means not primarily intended for or directed towards + commercial advantage or monetary compensation. 
For purposes of + this Public License, the exchange of the Licensed Material for + other material subject to Copyright and Similar Rights by digital + file-sharing or similar means is NonCommercial provided there is + no payment of monetary compensation in connection with the + exchange. + + j. Share means to provide material to the public by any means or + process that requires permission under the Licensed Rights, such + as reproduction, public display, public performance, distribution, + dissemination, communication, or importation, and to make material + available to the public including in ways that members of the + public may access the material from a place and at a time + individually chosen by them. + + k. Sui Generis Database Rights means rights other than copyright + resulting from Directive 96/9/EC of the European Parliament and of + the Council of 11 March 1996 on the legal protection of databases, + as amended and/or succeeded, as well as other essentially + equivalent rights anywhere in the world. + + l. You means the individual or entity exercising the Licensed Rights + under this Public License. Your has a corresponding meaning. + +Section 2 -- Scope. + + a. License grant. + + 1. Subject to the terms and conditions of this Public License, + the Licensor hereby grants You a worldwide, royalty-free, + non-sublicensable, non-exclusive, irrevocable license to + exercise the Licensed Rights in the Licensed Material to: + + a. reproduce and Share the Licensed Material, in whole or + in part, for NonCommercial purposes only; and + + b. produce, reproduce, and Share Adapted Material for + NonCommercial purposes only. + + 2. Exceptions and Limitations. For the avoidance of doubt, where + Exceptions and Limitations apply to Your use, this Public + License does not apply, and You do not need to comply with + its terms and conditions. + + 3. Term. The term of this Public License is specified in Section + 6(a). + + 4. 
Media and formats; technical modifications allowed. The + Licensor authorizes You to exercise the Licensed Rights in + all media and formats whether now known or hereafter created, + and to make technical modifications necessary to do so. The + Licensor waives and/or agrees not to assert any right or + authority to forbid You from making technical modifications + necessary to exercise the Licensed Rights, including + technical modifications necessary to circumvent Effective + Technological Measures. For purposes of this Public License, + simply making modifications authorized by this Section 2(a) + (4) never produces Adapted Material. + + 5. Downstream recipients. + + a. Offer from the Licensor -- Licensed Material. Every + recipient of the Licensed Material automatically + receives an offer from the Licensor to exercise the + Licensed Rights under the terms and conditions of this + Public License. + + b. No downstream restrictions. You may not offer or impose + any additional or different terms or conditions on, or + apply any Effective Technological Measures to, the + Licensed Material if doing so restricts exercise of the + Licensed Rights by any recipient of the Licensed + Material. + + 6. No endorsement. Nothing in this Public License constitutes or + may be construed as permission to assert or imply that You + are, or that Your use of the Licensed Material is, connected + with, or sponsored, endorsed, or granted official status by, + the Licensor or others designated to receive attribution as + provided in Section 3(a)(1)(A)(i). + + b. Other rights. + + 1. Moral rights, such as the right of integrity, are not + licensed under this Public License, nor are publicity, + privacy, and/or other similar personality rights; however, to + the extent possible, the Licensor waives and/or agrees not to + assert any such rights held by the Licensor to the limited + extent necessary to allow You to exercise the Licensed + Rights, but not otherwise. + + 2. 
Patent and trademark rights are not licensed under this + Public License. + + 3. To the extent possible, the Licensor waives any right to + collect royalties from You for the exercise of the Licensed + Rights, whether directly or through a collecting society + under any voluntary or waivable statutory or compulsory + licensing scheme. In all other cases the Licensor expressly + reserves any right to collect such royalties, including when + the Licensed Material is used other than for NonCommercial + purposes. + +Section 3 -- License Conditions. + +Your exercise of the Licensed Rights is expressly made subject to the +following conditions. + + a. Attribution. + + 1. If You Share the Licensed Material (including in modified + form), You must: + + a. retain the following if it is supplied by the Licensor + with the Licensed Material: + + i. identification of the creator(s) of the Licensed + Material and any others designated to receive + attribution, in any reasonable manner requested by + the Licensor (including by pseudonym if + designated); + + ii. a copyright notice; + + iii. a notice that refers to this Public License; + + iv. a notice that refers to the disclaimer of + warranties; + + v. a URI or hyperlink to the Licensed Material to the + extent reasonably practicable; + + b. indicate if You modified the Licensed Material and + retain an indication of any previous modifications; and + + c. indicate the Licensed Material is licensed under this + Public License, and include the text of, or the URI or + hyperlink to, this Public License. + + 2. You may satisfy the conditions in Section 3(a)(1) in any + reasonable manner based on the medium, means, and context in + which You Share the Licensed Material. For example, it may be + reasonable to satisfy the conditions by providing a URI or + hyperlink to a resource that includes the required + information. + + 3. 
If requested by the Licensor, You must remove any of the + information required by Section 3(a)(1)(A) to the extent + reasonably practicable. + + 4. If You Share Adapted Material You produce, the Adapter's + License You apply must not prevent recipients of the Adapted + Material from complying with this Public License. + +Section 4 -- Sui Generis Database Rights. + +Where the Licensed Rights include Sui Generis Database Rights that +apply to Your use of the Licensed Material: + + a. for the avoidance of doubt, Section 2(a)(1) grants You the right + to extract, reuse, reproduce, and Share all or a substantial + portion of the contents of the database for NonCommercial purposes + only; + + b. if You include all or a substantial portion of the database + contents in a database in which You have Sui Generis Database + Rights, then the database in which You have Sui Generis Database + Rights (but not its individual contents) is Adapted Material; and + + c. You must comply with the conditions in Section 3(a) if You Share + all or a substantial portion of the contents of the database. + +For the avoidance of doubt, this Section 4 supplements and does not +replace Your obligations under this Public License where the Licensed +Rights include other Copyright and Similar Rights. + +Section 5 -- Disclaimer of Warranties and Limitation of Liability. + + a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE + EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS + AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF + ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, + IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, + WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR + PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, + ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT + KNOWN OR DISCOVERABLE. 
WHERE DISCLAIMERS OF WARRANTIES ARE NOT + ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. + + b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE + TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, + NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, + INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, + COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR + USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN + ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR + DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR + IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. + + c. The disclaimer of warranties and limitation of liability provided + above shall be interpreted in a manner that, to the extent + possible, most closely approximates an absolute disclaimer and + waiver of all liability. + +Section 6 -- Term and Termination. + + a. This Public License applies for the term of the Copyright and + Similar Rights licensed here. However, if You fail to comply with + this Public License, then Your rights under this Public License + terminate automatically. + + b. Where Your right to use the Licensed Material has terminated under + Section 6(a), it reinstates: + + 1. automatically as of the date the violation is cured, provided + it is cured within 30 days of Your discovery of the + violation; or + + 2. upon express reinstatement by the Licensor. + + For the avoidance of doubt, this Section 6(b) does not affect any + right the Licensor may have to seek remedies for Your violations + of this Public License. + + c. For the avoidance of doubt, the Licensor may also offer the + Licensed Material under separate terms or conditions or stop + distributing the Licensed Material at any time; however, doing so + will not terminate this Public License. + + d. Sections 1, 5, 6, 7, and 8 survive termination of this Public + License. + +Section 7 -- Other Terms and Conditions. 
+ + a. The Licensor shall not be bound by any additional or different + terms or conditions communicated by You unless expressly agreed. + + b. Any arrangements, understandings, or agreements regarding the + Licensed Material not stated herein are separate from and + independent of the terms and conditions of this Public License. + +Section 8 -- Interpretation. + + a. For the avoidance of doubt, this Public License does not, and + shall not be interpreted to, reduce, limit, restrict, or impose + conditions on any use of the Licensed Material that could lawfully + be made without permission under this Public License. + + b. To the extent possible, if any provision of this Public License is + deemed unenforceable, it shall be automatically reformed to the + minimum extent necessary to make it enforceable. If the provision + cannot be reformed, it shall be severed from this Public License + without affecting the enforceability of the remaining terms and + conditions. + + c. No term or condition of this Public License will be waived and no + failure to comply consented to unless expressly agreed to by the + Licensor. + + d. Nothing in this Public License constitutes or may be interpreted + as a limitation upon, or waiver of, any privileges and immunities + that apply to the Licensor or You, including from the legal + processes of any jurisdiction or authority. + +======================================================================= + +Creative Commons is not a party to its public +licenses. Notwithstanding, Creative Commons may elect to apply one of +its public licenses to material it publishes and in those instances +will be considered the “Licensor.” The text of the Creative Commons +public licenses is dedicated to the public domain under the CC0 Public +Domain Dedication. 
Except for the limited purpose of indicating that +material is shared under a Creative Commons public license or as +otherwise permitted by the Creative Commons policies published at +creativecommons.org/policies, Creative Commons does not authorize the +use of the trademark "Creative Commons" or any other trademark or logo +of Creative Commons without its prior written consent including, +without limitation, in connection with any unauthorized modifications +to any of its public licenses or any other arrangements, +understandings, or agreements concerning use of licensed material. For +the avoidance of doubt, this paragraph does not form part of the +public licenses. + +Creative Commons may be contacted at creativecommons.org. \ No newline at end of file diff --git a/PuzzleTuning/Counterpart PreTrain Methods/gcmae/LINPROBE.md b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/LINPROBE.md new file mode 100644 index 0000000000000000000000000000000000000000..1cfc347057628562be6637111ee127c28235d4ae --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/LINPROBE.md @@ -0,0 +1,15 @@ +## Linear probe GCMAE + +``` +python main_linprobe.py \ + --data_path_train path/to/train/data \ + --data_path_val path/to/val/data \ + --nb_classes 2 \ + --output_dir path/to/ouput/dir \ + --log_dir path/to/log/dir \ + --batch_size 512 \ + --model vit_base_patch16 \ + --epochs 90 \ + --finetune path/to/pth/path +``` + diff --git a/PuzzleTuning/Counterpart PreTrain Methods/gcmae/PRETRAIN.md b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/PRETRAIN.md new file mode 100644 index 0000000000000000000000000000000000000000..ef3e5b89f499461d4208b89a5cfafcfe298c0b2c --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/PRETRAIN.md @@ -0,0 +1,21 @@ +## Pre-training GCMAE + +To pre-train ViT-Base (recommended default) +``` +python main_pretrain.py \ + --data_path path/to/data \ + --data_val_path path/to/data \ + --output_dir path/to/ouput/dir \ + --log_dir path/to/log/dir 
\ + --batch_size 128 \ + --model gcmae_vit_base_patch16 \ + --norm_pix_loss \ + --mask_ratio 0.5 \ + --epochs 80 \ + --warmup_epochs 40 \ + --blr 1e-3 --weight_decay 0.05 \ + --low_dim 768 \ + --nce_k 8192 \ + --nce_t 0.07 \ + --nce_m 0.5 +``` \ No newline at end of file diff --git a/PuzzleTuning/Counterpart PreTrain Methods/gcmae/README.md b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a708663eeb22ddb1c096d6e026d1fbfc63dac468 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/README.md @@ -0,0 +1,15 @@ +# GCMAE + +The original repo of GCMAE can be found [here](https://github.com/StarUniversus/gcmae) + +To install environments: +```bash +pip install -r requirements.txt +``` + + +To start pretraining: +```bash +# You need to alter the script according to your directories +bash pretrain.sh +``` diff --git a/PuzzleTuning/Counterpart PreTrain Methods/gcmae/README_origin.md b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/README_origin.md new file mode 100644 index 0000000000000000000000000000000000000000..1e112b24eb838119a25ee8877ce2186a3d63bbd2 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/README_origin.md @@ -0,0 +1,39 @@ +# GCMAE + +

+ +

+ +The official implementation of the paper [Global Contrast Masked Autoencoders Are Powerful Pathological Representation Learners](https://arxiv.org/abs/2205.09048) + +``` +@article{li2022gcmae, + author = {Quan, Hao and Li, Xingyu and Chen, Weixing and Bai, Qun and Zou, Mingchen and Yang, Ruijie and Zheng, Tingting and Qi, Ruiqun and Gao, Xinghua and Cui, Xiaoyu}, + title = {Global Contrast Masked Autoencoders Are Powerful Pathological Representation Learners}, + journal={arXiv:2205.09048}, + year = {2022}, +} +``` +## Abstract +Based on digital whole slide scanning technique, artificial intelligence algorithms represented by deep learning have achieved remarkable results in the field of computational pathology. Compared with other medical images such as Computed Tomography (CT) or Magnetic Resonance Imaging (MRI), pathological images are more difficult to annotate, thus there is an extreme lack of data sets that can be used for supervised learning. In this study, a self-supervised learning (SSL) model, Global Contrast Masked Autoencoders (GCMAE), is proposed, which has the ability to represent both global and local domain-specific features of whole slide image (WSI), as well as excellent cross-data transfer ability. The Camelyon16 and NCTCRC datasets are used to evaluate the performance of our model. When dealing with transfer learning tasks with different data sets, the experimental results show that GCMAE has better linear classification accuracy than MAE, which can reach **81.10%** and **89.22%** respectively. Our method outperforms the previous state-of-the-art algorithm and even surpasses supervised learning (improved by **3.86%** on NCTCRC data sets). + +## Installation +This repo is a modification on the [mae repo](https://github.com/facebookresearch/mae). Installation and preparation follow that repo. 
+ +## Usage + +* [PRETRAIN](PRETRAIN.md) + +* [LINPROBE](LINPROBE.md) + +* [FINETUNE](FINETUNE.md) + +* [ Visual GCMAE feature representation](VISUAL.md) + +## Dataset + * [Camelyon16](https://pan.baidu.com/s/1N0fqJR9u8yq-y6ZY0mSoUw?pwd=noms) + * [NCT-CRC-HE-100K](https://zenodo.org/record/1214456) + * [BreakHis](https://web.inf.ufpr.br/vri/databases/breast-cancer-histopathological-database-breakhis/) + +## License +Distributed under the CC-BY-NC 4.0 License. See [LICENSE](LICENSE) for more information. diff --git a/PuzzleTuning/Counterpart PreTrain Methods/gcmae/VISUAL.md b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/VISUAL.md new file mode 100644 index 0000000000000000000000000000000000000000..60ddf39b5439137bd954800bb17d1260f0159f58 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/VISUAL.md @@ -0,0 +1,10 @@ +## Visual GCMAE feature representation + +``` +python tsne.py \ + --batch_size 128 \ + --model vit_base_patch16 \ + --finetune path/to/pth \ + --save_path path/to/png \ + --data_path_val path/to/data \ +``` \ No newline at end of file diff --git a/PuzzleTuning/Counterpart PreTrain Methods/gcmae/engine_finetune.py b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/engine_finetune.py new file mode 100644 index 0000000000000000000000000000000000000000..a838164a11c1e49ad3dfeee932208724bfc99611 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/engine_finetune.py @@ -0,0 +1,183 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+# -------------------------------------------------------- +# References: +# DeiT: https://github.com/facebookresearch/deit +# BEiT: https://github.com/microsoft/unilm/tree/master/beit +# MAE: https://github.com/facebookresearch/mae +# -------------------------------------------------------- + +import math +import sys +from typing import Iterable, Optional + +import torch + +from timm.data import Mixup +from timm.utils import accuracy + +import util.misc as misc +import util.lr_sched as lr_sched +from sklearn.metrics import roc_auc_score +from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix +import torch.nn.functional as F +import numpy +import numpy as np +from torchmetrics import Specificity, AUROC + + + +def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, + data_loader: Iterable, optimizer: torch.optim.Optimizer, + device: torch.device, epoch: int, loss_scaler, max_norm: float = 0, + mixup_fn: Optional[Mixup] = None, log_writer=None, + args=None): + model.train(True) + metric_logger = misc.MetricLogger(delimiter=" ") + metric_logger.add_meter('lr', misc.SmoothedValue(window_size=1, fmt='{value:.6f}')) + header = 'Epoch: [{}]'.format(epoch) + print_freq = 20 + + accum_iter = args.accum_iter + + optimizer.zero_grad() + + if log_writer is not None: + print('log_dir: {}'.format(log_writer.log_dir)) + + for data_iter_step, (samples, targets) in enumerate(metric_logger.log_every(data_loader, print_freq, header)): + + # we use a per iteration (instead of per epoch) lr scheduler + if data_iter_step % accum_iter == 0: + lr_sched.adjust_learning_rate(optimizer, data_iter_step / len(data_loader) + epoch, args) + + samples = samples.to(device, non_blocking=True) + targets = targets.to(device, non_blocking=True) + + if mixup_fn is not None: + samples, targets = mixup_fn(samples, targets) + + with torch.cuda.amp.autocast(): + outputs = model(samples) + loss = criterion(outputs, targets) + + loss_value = loss.item() + + if 
not math.isfinite(loss_value): + print("Loss is {}, stopping training".format(loss_value)) + sys.exit(1) + + loss /= accum_iter + loss_scaler(loss, optimizer, clip_grad=max_norm, + parameters=model.parameters(), create_graph=False, + update_grad=(data_iter_step + 1) % accum_iter == 0) + if (data_iter_step + 1) % accum_iter == 0: + optimizer.zero_grad() + + torch.cuda.synchronize() + + metric_logger.update(loss=loss_value) + min_lr = 10. + max_lr = 0. + for group in optimizer.param_groups: + min_lr = min(min_lr, group["lr"]) + max_lr = max(max_lr, group["lr"]) + + metric_logger.update(lr=max_lr) + + loss_value_reduce = misc.all_reduce_mean(loss_value) + if log_writer is not None and (data_iter_step + 1) % accum_iter == 0: + """ We use epoch_1000x as the x-axis in tensorboard. + This calibrates different curves when batch size changes. + """ + epoch_1000x = int((data_iter_step / len(data_loader) + epoch) * 1000) + log_writer.add_scalar('loss', loss_value_reduce, epoch_1000x) + log_writer.add_scalar('lr', max_lr, epoch_1000x) + + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + return {k: meter.global_avg for k, meter in metric_logger.meters.items()} + +def confusion_m(y_true, y_pred): + tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel() + return tn, fp, fn, tp + +def compute_metrics_binary(probs, preds, targets): + auc = roc_auc_score(targets, probs) * 100 + precision = precision_score(targets, preds) * 100 + recall = recall_score(targets, preds) * 100 + f1 = f1_score(targets, preds) * 100 + tn, fp, fn, tp = confusion_m(targets, preds) + specificity = (tn / float(tn+fp)) * 100 + return auc, precision, recall, f1, specificity + +def compute_metrics_multiclass(probs, preds, targets, nb_classes): + preds_tensor, probs_tensor, targets_tensor = torch.tensor(preds), torch.tensor(probs), torch.tensor(targets) + auroc = AUROC(average='macro', num_classes=nb_classes) + auc = 
auroc(probs_tensor, targets_tensor) * 100 + precision = precision_score(targets, preds, average='macro') * 100 + recall = recall_score(targets, preds, average='macro') * 100 + f1 = f1_score(targets, preds, average='macro') * 100 + speci = Specificity(average='macro', num_classes=nb_classes) + specificity = speci(preds_tensor, targets_tensor) * 100 + return auc, precision, recall, f1, specificity + +@torch.no_grad() +def evaluate(data_loader, model, device, nb_classes): + criterion = torch.nn.CrossEntropyLoss() + m = torch.nn.Softmax(dim=1) + metric_logger = misc.MetricLogger(delimiter=" ") + header = 'Test:' + + # switch to evaluation mode + model.eval() + probs = [] + targets = [] + preds = [] + + for batch in metric_logger.log_every(data_loader, 10, header): + images = batch[0] + target = batch[-1] + images = images.to(device, non_blocking=True) + target = target.to(device, non_blocking=True) + + # compute output + with torch.cuda.amp.autocast(): + output = model(images) + loss = criterion(output, target) + output = m(output) + score, pred = output.topk(1, 1, True, True) + if nb_classes == 2: + prob = output[:, 1] + elif nb_classes > 2: + prob = output + + probs.extend(prob.detach().cpu().numpy()) + targets.extend(target.detach().cpu().numpy()) + preds.extend(pred.tolist()) + if nb_classes < 5: + acc1, acc5 = accuracy(output, target, topk=(1, 1)) + else: + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + + batch_size = images.shape[0] + metric_logger.update(loss=loss.item()) + metric_logger.meters['acc1'].update(acc1.item(), n=batch_size) + metric_logger.meters['acc5'].update(acc5.item(), n=batch_size) + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print('* Acc@1 {top1.global_avg:.3f} Acc@5 {top5.global_avg:.3f} loss {losses.global_avg:.3f}' + .format(top1=metric_logger.acc1, top5=metric_logger.acc5, losses=metric_logger.loss)) + if nb_classes == 2: + print("binary class metrics!") + auc, precision, recall, f1, 
specificity = compute_metrics_binary(probs, preds, targets) + + elif nb_classes > 2: + print("multi_class metrics!") + auc, precision, recall, f1, specificity = compute_metrics_multiclass(probs, preds, targets, nb_classes) + + return {k: meter.global_avg for k, meter in metric_logger.meters.items()}, auc, precision, recall, f1, specificity \ No newline at end of file diff --git a/PuzzleTuning/Counterpart PreTrain Methods/gcmae/engine_pretrain.py b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/engine_pretrain.py new file mode 100644 index 0000000000000000000000000000000000000000..5203e79127d63c12e63b1a3e7ec60602f80fc915 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/engine_pretrain.py @@ -0,0 +1,99 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# References: +# DeiT: https://github.com/facebookresearch/deit +# BEiT: https://github.com/microsoft/unilm/tree/master/beit +# -------------------------------------------------------- +import math +import sys +from typing import Iterable + +import torch + +import util.misc as misc +import util.lr_sched as lr_sched +from test_npid import NN, kNN + +def train_one_epoch(model: torch.nn.Module, + data_loader: Iterable, + optimizer: torch.optim.Optimizer, + device: torch.device, epoch: int, loss_scaler, + log_writer=None, + args=None, + lemniscate=None, + ): + model.train(True) + metric_logger = misc.MetricLogger(delimiter=" ") + metric_logger.add_meter('lr', misc.SmoothedValue(window_size=1, fmt='{value:.6f}')) + header = 'Epoch: [{}]'.format(epoch) + print_freq = 20 + + accum_iter = args.accum_iter + + optimizer.zero_grad() + + if log_writer is not None: + print('log_dir: {}'.format(log_writer.log_dir)) + + for data_iter_step, (samples, _, index) in 
enumerate(metric_logger.log_every(data_loader, print_freq, header)): + + # we use a per iteration (instead of per epoch) lr scheduler + if data_iter_step % accum_iter == 0: + lr_sched.adjust_learning_rate(optimizer, data_iter_step / len(data_loader) + epoch, args) + + samples = samples.to(device, non_blocking=True) + index = index.to(device, non_blocking=True) + with torch.cuda.amp.autocast(): + loss_mae, _, _, loss_npid, _= model(samples, mask_ratio=args.mask_ratio, index = index, is_train=True) + loss = loss_mae + 0.1 * loss_npid + + loss_value = loss.item() + + if not math.isfinite(loss_value): + print("Loss is {}, stopping training".format(loss_value)) + sys.exit(1) + + loss /= accum_iter + loss_scaler(loss, optimizer, parameters=model.parameters(), + update_grad=(data_iter_step + 1) % accum_iter == 0) + if (data_iter_step + 1) % accum_iter == 0: + optimizer.zero_grad() + + torch.cuda.synchronize() + + + metric_logger.update(loss_all=loss_value) + metric_logger.update(loss_mae=loss_mae.item()) + metric_logger.update(loss_npid=loss_npid.item()) + + + lr = optimizer.param_groups[0]["lr"] + metric_logger.update(lr=lr) + + loss_value_reduce = misc.all_reduce_mean(loss_value) + if log_writer is not None and (data_iter_step + 1) % accum_iter == 0: + """ We use epoch_1000x as the x-axis in tensorboard. + This calibrates different curves when batch size changes. 
+ """ + + epoch_1000x = int((data_iter_step / len(data_loader) + epoch) * 1000) + log_writer.add_scalar('train_loss', loss_value_reduce, epoch_1000x) + log_writer.add_scalar('origin_loss/train_loss_mae', loss_mae.item(), epoch_1000x) + log_writer.add_scalar('origin_loss/train_loss_npid', loss_npid.item(), epoch_1000x) + log_writer.add_scalar('lr', lr, epoch_1000x) + + # pred1 = NN(epoch, model, lemniscate, data_loader, data_loader_val) + # log_writer.add_scalar('NN_ac', pred1, epoch) + # if args.output_dir and (epoch % 20 == 0 or epoch + 1 == args.epochs): + + # top1 = kNN(0, model, lemniscate, data_loader, data_loader_val, 200, args.nce_t) + # log_writer.add_scalar('KNN_top1', top1, epoch) + + metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + # return {k: meter.global_avg for k, meter in metric_logger.meters.items()}, pred1 + return {k: meter.global_avg for k, meter in metric_logger.meters.items()} \ No newline at end of file diff --git a/PuzzleTuning/Counterpart PreTrain Methods/gcmae/lib/NCEAverage.py b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/lib/NCEAverage.py new file mode 100644 index 0000000000000000000000000000000000000000..20978a4003c2c145189e53a9f3463b9770b771de --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/lib/NCEAverage.py @@ -0,0 +1,94 @@ +import torch +from torch.autograd import Function +from torch import nn +from .alias_multinomial import AliasMethod +import math + +class NCEFunction(Function): + @staticmethod + def forward(self, x, y, memory, idx, params): + K = int(params[0].item()) + T = params[1].item() + Z = params[2].item() + + momentum = params[3].item() + batchSize = x.size(0) + outputSize = memory.size(0) + inputSize = memory.size(1) + + # sample positives & negatives + idx.select(1,0).copy_(y.detach()) + + # sample correspoinding weights + weight = torch.index_select(memory, 0, idx.view(-1)) + weight.resize_(batchSize, K+1, inputSize) + + # inner product + with 
torch.cuda.amp.autocast(enabled = False): + out = torch.bmm(weight, x.detach().reshape(batchSize, inputSize, 1)) + #print(out) + out.div_(T).exp_() # batchSize * self.K+1 + #x.detach().resize_(batchSize, inputSize) + #print(out) + if Z < 0: + params[2] = out.mean() * outputSize + Z = params[2].item() + print("normalization constant Z is set to {:.1f}".format(Z)) + + out.div_(Z).resize_(batchSize, K+1) + + self.save_for_backward(x, memory, y, weight, out, params) + + return out + + @staticmethod + def backward(self, gradOutput): + x, memory, y, weight, out, params = self.saved_tensors + K = int(params[0].item()) + T = params[1].item() + Z = params[2].item() + momentum = params[3].item() + batchSize = gradOutput.size(0) + + # gradients d Pm / d linear = exp(linear) / Z + gradOutput.detach().mul_(out.detach()) + # add temperature + gradOutput.detach().div_(T) + + + + # gradient of linear + with torch.cuda.amp.autocast(enabled = False): + gradInput = torch.bmm(gradOutput.detach().reshape(batchSize, 1, K+1), weight) + gradInput.resize_as_(x) + + # update the non-parametric detach() + weight_pos = weight.select(1, 0).resize_as_(x) + weight_pos.mul_(momentum) + weight_pos.add_(torch.mul(x.detach(), 1-momentum)) + w_norm = weight_pos.pow(2).sum(1, keepdim=True).pow(0.5) + updated_weight = weight_pos.div(w_norm) + memory.index_copy_(0, y, updated_weight) + + return gradInput, None, None, None, None + +class NCEAverage(nn.Module): + + def __init__(self, inputSize, outputSize, K, T=0.07, momentum=0.5, Z=None): + super(NCEAverage, self).__init__() + self.nLem = outputSize + self.unigrams = torch.ones(self.nLem) + self.multinomial = AliasMethod(self.unigrams) + self.multinomial.cuda() + self.K = K + + self.register_buffer('params',torch.tensor([K, T, -1, momentum])) + stdv = 1. 
/ math.sqrt(inputSize/3) + self.register_buffer('memory', torch.rand(outputSize, inputSize).mul_(2*stdv).add_(-stdv)) + + def forward(self, x, y): + batchSize = x.size(0) + idx = self.multinomial.draw(batchSize * (self.K+1)).view(batchSize, -1) + out = NCEFunction.apply(x, y, self.memory, idx, self.params) + return out + diff --git a/PuzzleTuning/Counterpart PreTrain Methods/gcmae/lib/NCECriterion.py b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/lib/NCECriterion.py new file mode 100644 index 0000000000000000000000000000000000000000..1fcd33441e7917f82059d20da77625b2052475da --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/lib/NCECriterion.py @@ -0,0 +1,38 @@ +import torch +from torch import nn + +eps = 1e-7 + +class NCECriterion(nn.Module): + + def __init__(self, nLem): + super(NCECriterion, self).__init__() + self.nLem = nLem + + def forward(self, x, targets): + batchSize = x.size(0) + K = x.size(1)-1 + Pnt = 1 / float(self.nLem) + Pns = 1 / float(self.nLem) + + # eq 5.1 : P(origin=model) = Pmt / (Pmt + k*Pnt) + Pmt = x.select(1,0) + Pmt_div = Pmt.add(K * Pnt + eps) + lnPmt = torch.div(Pmt, Pmt_div) + + # eq 5.2 : P(origin=noise) = k*Pns / (Pms + k*Pns) + Pon_div = x.narrow(1,1,K).add(K * Pns + eps) + Pon = Pon_div.clone().fill_(K * Pns) + lnPon = torch.div(Pon, Pon_div) + + # equation 6 in ref. 
A + lnPmt.log_() + lnPon.log_() + + lnPmtsum = lnPmt.sum(0) + lnPonsum = lnPon.view(-1, 1).sum(0) + + loss = - (lnPmtsum + lnPonsum) / batchSize + + return loss + diff --git a/PuzzleTuning/Counterpart PreTrain Methods/gcmae/lib/__init__.py b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/lib/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..556df42ea177f37fd5e5497c14eae9e17f9b8406 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/lib/__init__.py @@ -0,0 +1 @@ +# nothing diff --git a/PuzzleTuning/Counterpart PreTrain Methods/gcmae/lib/__pycache__/NCEAverage.cpython-38.pyc b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/lib/__pycache__/NCEAverage.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8617f136507d4e4bcaa2f50895348f1f4efe5ad6 Binary files /dev/null and b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/lib/__pycache__/NCEAverage.cpython-38.pyc differ diff --git a/PuzzleTuning/Counterpart PreTrain Methods/gcmae/lib/__pycache__/NCECriterion.cpython-38.pyc b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/lib/__pycache__/NCECriterion.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ee9ccef395b7aea97b1db1d41ecec0dde54a8c4 Binary files /dev/null and b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/lib/__pycache__/NCECriterion.cpython-38.pyc differ diff --git a/PuzzleTuning/Counterpart PreTrain Methods/gcmae/lib/__pycache__/__init__.cpython-38.pyc b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/lib/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9716f8fa40858a99a675881da23dc194af9150db Binary files /dev/null and b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/lib/__pycache__/__init__.cpython-38.pyc differ diff --git a/PuzzleTuning/Counterpart PreTrain Methods/gcmae/lib/__pycache__/alias_multinomial.cpython-38.pyc b/PuzzleTuning/Counterpart PreTrain 
# ---- gcmae/lib/alias_multinomial.py ----
import torch
import numpy as np
from torch.autograd import Variable
from torch import nn


class AliasMethod(object):
    '''
    Alias-method sampler for a discrete distribution.

    From: https://hips.seas.harvard.edu/blog/2013/03/03/the-alias-method-efficient-sampling-with-many-discrete-outcomes/

    After construction ``self.prob[i]`` is the probability of keeping column
    ``i`` once it has been picked uniformly, and ``self.alias[i]`` is the
    outcome returned otherwise.
    '''
    def __init__(self, probs):
        """Build the alias table for ``probs`` (a 1-D tensor of weights).

        Weights whose sum exceeds 1 are normalised first. The normalisation is
        done on a new tensor so the caller's ``probs`` is left untouched (the
        previous in-place ``div_`` silently rewrote the argument).
        """
        total = probs.sum()
        if total > 1:
            probs = probs / total
        K = len(probs)
        self.prob = torch.zeros(K)
        self.alias = torch.zeros(K, dtype=torch.long)

        # Scale every probability by K in one shot, then sort the outcomes
        # into those with scaled probability below 1 and the rest.
        self.prob.copy_(probs * K)
        below_one = self.prob < 1.0
        smaller = torch.nonzero(below_one).flatten().tolist()
        larger = torch.nonzero(~below_one).flatten().tolist()

        # Loop though and create little binary mixtures that
        # appropriately allocate the larger outcomes over the
        # overall uniform mixture.
        while len(smaller) > 0 and len(larger) > 0:
            small = smaller.pop()
            large = larger.pop()

            self.alias[small] = large
            self.prob[large] = (self.prob[large] - 1.0) + self.prob[small]

            if self.prob[large] < 1.0:
                smaller.append(large)
            else:
                larger.append(large)

        # Whatever is left over is pinned to exactly 1 so that rounding
        # residue cannot leave a value outside [0, 1] for bernoulli().
        for last_one in smaller+larger:
            self.prob[last_one] = 1

    def cuda(self):
        """Move both tables to the current CUDA device; returns ``self`` for chaining."""
        self.prob = self.prob.cuda()
        self.alias = self.alias.cuda()
        return self

    def draw(self, N):
        '''
        Draw N samples from multinomial

        Returns a 1-D long tensor of N outcome indices on the same device as
        the tables.
        '''
        K = self.alias.size(0)

        # pick a column uniformly, then keep it or jump to its alias
        kk = torch.zeros(N, dtype=torch.long, device=self.prob.device).random_(0, K)
        prob = self.prob.index_select(0, kk)
        alias = self.alias.index_select(0, kk)
        # b is whether a random number is greater than q
        b = torch.bernoulli(prob)
        oq = kk.mul(b.long())
        oj = alias.mul((1-b).long())

        return oq + oj


# ---- gcmae/lib/normalize.py ----
class Normalize(nn.Module):
    """Divide each row of the input by its L-``power`` norm taken over dim 1.

    No epsilon is added: a row whose norm is zero produces NaN entries.
    """

    def __init__(self, power=2):
        super(Normalize, self).__init__()
        self.power = power

    def forward(self, x):
        norm = x.pow(self.power).sum(1, keepdim=True).pow(1./self.power)
        out = x.div(norm)
        return out
# ---- gcmae/lib/utils.py ----
class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear every statistic back to zero."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as the latest value, weighted by ``n`` observations."""
        self.val = val
        self.sum += val * n
        self.count += n
        # With nothing counted yet (e.g. first call with n=0) the mean is
        # undefined; keep the reset value instead of raising ZeroDivisionError.
        self.avg = self.sum / self.count if self.count else 0


# ---- gcmae/load_vit_from_ckpt.py ----
# Extracting backbone from a specified gcmae checkpoint.
#
# Example:
#
# python load_vit_from_ckpt.py \
#     --checkpoint /home/workenv/label-efficient-dl/gcmae/gcmae/output/checkpoint-19.pth \
#     --save-to ./output/final_models/ \
#     --save-name vit_gcmae_16_224.pth \
#     --num-classes 2
#
# gen_basic_weight relies on the file-level names `os`, `torch` and
# `create_model` (from timm).
def gen_basic_weight(save_dir):
    """Save a randomly initialised and a pretrained ViT-B/16 state dict.

    Writes ``ViT_b16_224_Random_Init.pth`` and ``ViT_b16_224_Imagenet.pth``
    into ``save_dir``, creating the directory when needed.
    """
    # Load timm vit weight
    model = create_model('vit_base_patch16_224', pretrained=False, in_chans=3)
    random_state_dict = model.state_dict()

    model = create_model('vit_base_patch16_224', pretrained=True, in_chans=3)
    pretrained_state_dict = model.state_dict()

    # Save model
    print(f'Saving backbone init weight to {save_dir}...')
    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(save_dir, exist_ok=True)
    torch.save(random_state_dict, os.path.join(save_dir, 'ViT_b16_224_Random_Init.pth'))
    torch.save(pretrained_state_dict, os.path.join(save_dir, 'ViT_b16_224_Imagenet.pth'))
# --------------------------------------------------------
# Interpolate position embeddings for high-resolution
# References:
# DeiT: https://github.com/facebookresearch/deit
# --------------------------------------------------------
def interpolate_pos_embed(model, checkpoint_model):
    """Resize ``checkpoint_model['pos_embed']`` in place to fit ``model``.

    Does nothing when the checkpoint has no ``'pos_embed'`` entry or when the
    checkpoint grid already has the model's side length. Otherwise the
    patch-position part is resampled bicubically to the model's grid and the
    leading extra tokens are copied over unchanged.
    """
    if 'pos_embed' not in checkpoint_model:
        return

    pos_embed_checkpoint = checkpoint_model['pos_embed']
    embedding_size = pos_embed_checkpoint.shape[-1]
    num_patches = model.patch_embed.num_patches
    num_extra_tokens = model.pos_embed.shape[-2] - num_patches
    # height (== width) for the checkpoint position embedding
    orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
    # height (== width) for the new position embedding
    new_size = int(num_patches ** 0.5)
    if orig_size == new_size:
        return

    # class_token and dist_token are kept unchanged
    print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size))
    extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
    # only the position tokens are interpolated
    pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
    # (1, N, C) -> (1, C, H, W) so that interpolate() sees a 2-D grid
    pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
    pos_tokens = torch.nn.functional.interpolate(
        pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
    pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
    checkpoint_model['pos_embed'] = torch.cat((extra_tokens, pos_tokens), dim=1)


def main(args):
    """Copy the ViT backbone weights out of a gcmae checkpoint and save them.

    Reads ``args.basic_weight`` (initial state dict), ``args.checkpoint``
    (a dict with a ``'model'`` entry), and writes the merged state dict to
    ``args.save_to / args.save_name``.

    Raises:
        RuntimeError: if ``args.basic_weight`` is empty. The original code hit
            a bare ``raise`` here, which surfaced as the same exception type
            but without any explanation.
    """
    if not args.basic_weight:
        raise RuntimeError(
            '--basic-weight is required: pass the state dict used to initialise the ViT backbone')

    # Initialize model
    model = create_model('vit_base_patch16_224', pretrained=False, in_chans=3)

    # Load basic weights (default initial parameters).
    # map_location='cpu' lets GPU-saved files load on CPU-only machines; the
    # model built above is never moved off the CPU in this function.
    basic_weight = torch.load(args.basic_weight, map_location='cpu')
    model.load_state_dict(basic_weight, strict=False)

    # Load checkpoint
    checkpoint = torch.load(args.checkpoint, map_location='cpu')
    ckp_state_dict = checkpoint['model']
    model_state_dict = model.state_dict()

    # interpolate position embedding
    interpolate_pos_embed(model, ckp_state_dict)

    # Report checkpoint entries that have no counterpart in the model.
    print('checking checkpoint weights...')
    len_state_dict = len(ckp_state_dict)
    for seq, src_k in enumerate(ckp_state_dict.keys()):
        if str(src_k) not in model_state_dict:
            print(f'{seq+1}/{len_state_dict} Skipped: {src_k}, {ckp_state_dict[src_k].shape}')

    # Overwrite every model entry that the checkpoint provides.
    print('loading weights...')
    len_state_dict = len(model_state_dict)
    for seq, tgt_k in enumerate(model_state_dict.keys()):
        if tgt_k in ckp_state_dict:
            model_state_dict[tgt_k] = ckp_state_dict[tgt_k]
        else:
            print(f'{seq+1}/{len_state_dict} Skipped: {tgt_k}')

    model.load_state_dict(model_state_dict, strict=False)

    # Save model
    print(f'Saving model to {args.save_to}...')
    os.makedirs(args.save_to, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(args.save_to, args.save_name))


def get_args_parser():
    """Input parameters

    ``--checkpoint``, ``--save-to`` and ``--save-name`` are declared
    ``required=True``, so their ``default`` values are never used.
    """
    parser = argparse.ArgumentParser(description='Extract backbone state dict')
    parser.add_argument('--checkpoint', default='./checkpoint_0004.pth.tar', type=str, required=True,
                        help='Path to the checkpoint')
    parser.add_argument('--save-to', default='./output', type=str, required=True,
                        help='Where to save the model')
    parser.add_argument('--save-name', default='vit_gcmae_16_224.pth', type=str, required=True,
                        help='Model save name')
    parser.add_argument('--num-classes', default=2, type=int,
                        help='Number of classes to be classified')
    parser.add_argument('--random-seed', default=42, type=int,
                        help='Random seed (enable reproduction)')
    parser.add_argument('--basic-weight', default='', type=str,
                        help='Basic weight (used to init parameters)')
    return parser


def setup_seed(seed):
    """Fix up the random seed

    Seeds torch (CPU and every CUDA device) and Python's ``random`` module,
    and switches cuDNN to deterministic mode.

    Args:
        seed (int): Seed to be applied
    """
    import random
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True


if __name__ == '__main__':
    parser = get_args_parser()
    args = parser.parse_args()

    setup_seed(args.random_seed)
    main(args)
# ---- gcmae/main_finetune.py ----
# --------------------------------------------------------
# References:
# DeiT: https://github.com/facebookresearch/deit
# BEiT: https://github.com/microsoft/unilm/tree/master/beit
# MAE: https://github.com/facebookresearch/mae
# --------------------------------------------------------
# The definitions below rely on this file's module-level imports (argparse,
# datetime, json, numpy as np, os, time, Path, torch, cudnn, SummaryWriter,
# timm helpers, util.*, models_vit, engine_finetune).


def get_args_parser():
    """Build the command-line parser for GCMAE fine-tuning.

    The parser is created with ``add_help=False``.

    NOTE(review): ``--finetune``, ``--data_path``, ``--output_dir`` and
    ``--log_dir`` default to a single space, which is truthy; callers that test
    these values for emptiness will therefore see them as "set" unless the
    option is passed explicitly. The defaults are left as they were.
    """
    parser = argparse.ArgumentParser('GCMAE fine-tuning for image classification', add_help=False)
    parser.add_argument('--batch_size', default=128, type=int,
                        help='Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus')
    parser.add_argument('--epochs', default=50, type=int)
    parser.add_argument('--accum_iter', default=1, type=int,
                        help='Accumulate gradient iterations (for increasing the effective batch size under memory constraints)')

    # Model parameters
    parser.add_argument('--model', default='vit_base_patch16', type=str, metavar='MODEL',
                        help='Name of model to train')

    parser.add_argument('--input_size', default=224, type=int,
                        help='images input size')

    parser.add_argument('--drop_path', type=float, default=0.1, metavar='PCT',
                        help='Drop path rate (default: 0.1)')

    # Optimizer parameters
    parser.add_argument('--clip_grad', type=float, default=None, metavar='NORM',
                        help='Clip gradient norm (default: None, no clipping)')
    parser.add_argument('--weight_decay', type=float, default=0.05,
                        help='weight decay (default: 0.05)')

    parser.add_argument('--lr', type=float, default=None, metavar='LR',
                        help='learning rate (absolute lr)')
    parser.add_argument('--blr', type=float, default=1e-3, metavar='LR',
                        help='base learning rate: absolute_lr = base_lr * total_batch_size / 256')
    parser.add_argument('--layer_decay', type=float, default=0.75,
                        help='layer-wise lr decay from ELECTRA/BEiT')

    parser.add_argument('--min_lr', type=float, default=1e-6, metavar='LR',
                        help='lower lr bound for cyclic schedulers that hit 0')

    parser.add_argument('--warmup_epochs', type=int, default=5, metavar='N',
                        help='epochs to warmup LR')

    # Augmentation parameters
    parser.add_argument('--color_jitter', type=float, default=None, metavar='PCT',
                        help='Color jitter factor (enabled only when not using Auto/RandAug)')
    # The help text used to contain a literal '" + "' left over from a string
    # concatenation, and the call was followed by a stray comma.
    parser.add_argument('--aa', type=str, default='rand-m9-mstd0.5-inc1', metavar='NAME',
                        help='Use AutoAugment policy. "v0" or "original". (default: rand-m9-mstd0.5-inc1)')
    parser.add_argument('--smoothing', type=float, default=0.1,
                        help='Label smoothing (default: 0.1)')

    # * Random Erase params
    parser.add_argument('--reprob', type=float, default=0.25, metavar='PCT',
                        help='Random erase prob (default: 0.25)')
    parser.add_argument('--remode', type=str, default='pixel',
                        help='Random erase mode (default: "pixel")')
    parser.add_argument('--recount', type=int, default=1,
                        help='Random erase count (default: 1)')
    parser.add_argument('--resplit', action='store_true', default=False,
                        help='Do not random erase first (clean) augmentation split')

    # * Mixup params
    parser.add_argument('--mixup', type=float, default=0,
                        help='mixup alpha, mixup enabled if > 0.')
    parser.add_argument('--cutmix', type=float, default=0,
                        help='cutmix alpha, cutmix enabled if > 0.')
    parser.add_argument('--cutmix_minmax', type=float, nargs='+', default=None,
                        help='cutmix min/max ratio, overrides alpha and enables cutmix if set (default: None)')
    parser.add_argument('--mixup_prob', type=float, default=1.0,
                        help='Probability of performing mixup or cutmix when either/both is enabled')
    parser.add_argument('--mixup_switch_prob', type=float, default=0.5,
                        help='Probability of switching to cutmix when both mixup and cutmix enabled')
    parser.add_argument('--mixup_mode', type=str, default='batch',
                        help='How to apply mixup/cutmix params. Per "batch", "pair", or "elem"')

    # * Finetuning params
    parser.add_argument('--finetune', default=' ',
                        help='finetune from checkpoint')
    parser.add_argument('--global_pool', action='store_true')
    parser.set_defaults(global_pool=False)
    parser.add_argument('--cls_token', action='store_false', dest='global_pool',
                        help='Use class token instead of global pool for classification')

    # Dataset parameters
    parser.add_argument('--data_path', default=' ', type=str,
                        help='dataset path')
    parser.add_argument('--nb_classes', default=2, type=int,
                        help='number of the classification types')

    parser.add_argument('--output_dir', default=' ',
                        help='path where to save, empty for no saving')
    parser.add_argument('--log_dir', default=' ',
                        help='path where to tensorboard log')
    parser.add_argument('--device', default='cuda',
                        help='device to use for training / testing')
    parser.add_argument('--seed', default=0, type=int)
    parser.add_argument('--resume', default='',
                        help='resume from checkpoint')

    parser.add_argument('--start_epoch', default=0, type=int, metavar='N',
                        help='start epoch')
    parser.add_argument('--eval', action='store_true',
                        help='Perform evaluation only')
    parser.set_defaults(eval=False)
    parser.add_argument('--dist_eval', action='store_true', default=False,
                        help='Enabling distributed evaluation (recommended during training for faster monitor')
    parser.add_argument('--num_workers', default=16, type=int)
    parser.add_argument('--pin_mem', action='store_true',
                        help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.')
    parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem')
    parser.set_defaults(pin_mem=True)

    # distributed training parameters
    parser.add_argument('--world_size', default=1, type=int,
                        help='number of distributed processes')
    parser.add_argument('--local_rank', default=-1, type=int)
    parser.add_argument('--dist_on_itp', action='store_true')
    parser.add_argument('--dist_url', default='env://',
                        help='url used to set up distributed training')
    parser.add_argument('--gpu_id', default=0, type=int,
                        help="the order of gpu")

    return parser


def main(args):
    """Fine-tune (or, with ``args.eval``, only evaluate) a ViT classifier.

    Builds the datasets, samplers, loaders, model, optimizer and criterion
    from ``args``, optionally loads ``args.finetune`` weights, then runs the
    train / evaluate loop, saving a checkpoint and appending one JSON line to
    ``<output_dir>/log.txt`` per epoch.

    Raises:
        SystemExit: with code 0 once evaluation has been printed in
            ``args.eval`` mode.
    """
    torch.cuda.set_device(args.gpu_id)
    misc.init_distributed_mode(args)

    print('job dir: {}'.format(os.path.dirname(os.path.realpath(__file__))))
    print("{}".format(args).replace(', ', ',\n'))

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + misc.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)

    cudnn.benchmark = True

    dataset_train = build_dataset(is_train=True, args=args)
    dataset_val = build_dataset(is_train=False, args=args)

    # The original wrapped this in `if True:  # args.distributed:` with an
    # unreachable non-distributed branch; only the live branch is kept.
    num_tasks = misc.get_world_size()
    global_rank = misc.get_rank()
    sampler_train = torch.utils.data.DistributedSampler(
        dataset_train, num_replicas=num_tasks, rank=global_rank, shuffle=True
    )
    print("Sampler_train = %s" % str(sampler_train))
    if args.dist_eval:
        if len(dataset_val) % num_tasks != 0:
            print('Warning: Enabling distributed evaluation with an eval dataset not divisible by process number. '
                  'This will slightly alter validation results as extra duplicate entries are added to achieve '
                  'equal num of samples per-process.')
        sampler_val = torch.utils.data.DistributedSampler(
            dataset_val, num_replicas=num_tasks, rank=global_rank, shuffle=True)  # shuffle=True to reduce monitor bias
    else:
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    # Only rank 0 writes tensorboard events, and only when training.
    if global_rank == 0 and args.log_dir is not None and not args.eval:
        os.makedirs(args.log_dir, exist_ok=True)
        log_writer = SummaryWriter(log_dir=args.log_dir)
    else:
        log_writer = None

    data_loader_train = torch.utils.data.DataLoader(
        dataset_train, sampler=sampler_train,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        pin_memory=args.pin_mem,
        drop_last=True,
    )

    data_loader_val = torch.utils.data.DataLoader(
        dataset_val, sampler=sampler_val,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        pin_memory=args.pin_mem,
        drop_last=False
    )

    mixup_fn = None
    mixup_active = args.mixup > 0 or args.cutmix > 0. or args.cutmix_minmax is not None
    if mixup_active:
        print("Mixup is activated!")
        mixup_fn = Mixup(
            mixup_alpha=args.mixup, cutmix_alpha=args.cutmix, cutmix_minmax=args.cutmix_minmax,
            prob=args.mixup_prob, switch_prob=args.mixup_switch_prob, mode=args.mixup_mode,
            label_smoothing=args.smoothing, num_classes=args.nb_classes)

    model = models_vit.__dict__[args.model](
        num_classes=args.nb_classes,
        drop_path_rate=args.drop_path,
        global_pool=args.global_pool,
    )

    if args.finetune and not args.eval:
        checkpoint = torch.load(args.finetune, map_location='cpu')

        print("Load pre-trained checkpoint from: %s" % args.finetune)
        checkpoint_model = checkpoint['model']
        state_dict = model.state_dict()
        # drop a classification head whose shape does not fit this task
        for k in ['head.weight', 'head.bias']:
            if k in checkpoint_model and checkpoint_model[k].shape != state_dict[k].shape:
                print(f"Removing key {k} from pretrained checkpoint")
                del checkpoint_model[k]

        # interpolate position embedding
        interpolate_pos_embed(model, checkpoint_model)

        # load pre-trained model
        msg = model.load_state_dict(checkpoint_model, strict=False)
        print(msg)

        if args.global_pool:
            assert set(msg.missing_keys) == {'head.weight', 'head.bias', 'fc_norm.weight', 'fc_norm.bias'}
        else:
            assert set(msg.missing_keys) == {'head.weight', 'head.bias'}

        # manually initialize fc layer
        trunc_normal_(model.head.weight, std=2e-5)

    model.to(device)

    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)

    print("Model = %s" % str(model_without_ddp))
    print('number of params (M): %.2f' % (n_parameters / 1.e6))

    eff_batch_size = args.batch_size * args.accum_iter * misc.get_world_size()

    if args.lr is None:  # only base_lr is specified
        args.lr = args.blr * eff_batch_size / 256

    print("base lr: %.2e" % (args.lr * 256 / eff_batch_size))
    print("actual lr: %.2e" % args.lr)

    print("accumulate grad iterations: %d" % args.accum_iter)
    print("effective batch size: %d" % eff_batch_size)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module

    # build optimizer with layer-wise lr decay (lrd)
    param_groups = lrd.param_groups_lrd(
        model_without_ddp, args.weight_decay,
        no_weight_decay_list=model_without_ddp.no_weight_decay(),
        layer_decay=args.layer_decay
    )
    optimizer = torch.optim.AdamW(param_groups, lr=args.lr)
    loss_scaler = NativeScaler()

    if mixup_fn is not None:
        # smoothing is handled with mixup label transform
        criterion = SoftTargetCrossEntropy()
    elif args.smoothing > 0.:
        criterion = LabelSmoothingCrossEntropy(smoothing=args.smoothing)
    else:
        criterion = torch.nn.CrossEntropyLoss()

    print("criterion = %s" % str(criterion))

    misc.load_model(args=args, model_without_ddp=model_without_ddp, optimizer=optimizer, loss_scaler=loss_scaler)

    if args.eval:
        test_stats, auc, precision, recall, f1, specificity = evaluate(data_loader_val, model, device, args.nb_classes)
        print(f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.2f}%, AUC:{auc:.2f}%, precision {precision:.2f}%, recall {recall:.2f}%, f1_score {f1:.2f}%, specificity {specificity:.2f}%")
        # same effect as the former exit(0), without depending on the
        # `site`-provided builtin
        raise SystemExit(0)

    print(f"Start training for {args.epochs} epochs")
    start_time = time.time()
    max_accuracy = 0.0
    max_auc = 0.0
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            data_loader_train.sampler.set_epoch(epoch)
        train_stats = train_one_epoch(
            model, criterion, data_loader_train,
            optimizer, device, epoch, loss_scaler,
            args.clip_grad, mixup_fn,
            log_writer=log_writer,
            args=args
        )
        # A checkpoint is written after every epoch (the former
        # `epoch % 1 == 0 or ...` test was always true).
        if args.output_dir:
            misc.save_model(
                args=args, model=model, model_without_ddp=model_without_ddp, optimizer=optimizer,
                loss_scaler=loss_scaler, epoch=epoch)

        test_stats, auc, precision, recall, f1, specificity = evaluate(data_loader_val, model, device, args.nb_classes)
        print(f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.2f}%, AUC: {auc:.2f}%, precision {precision:.2f}%, recall {recall:.2f}%, f1_score {f1:.2f}%, specificity {specificity:.2f}%")
        max_accuracy = max(max_accuracy, test_stats["acc1"])
        max_auc = max(max_auc, auc)
        print(f'Max accuracy: {max_accuracy:.2f}%, Max AUC: {max_auc:.2f}%')

        if log_writer is not None:
            log_writer.add_scalar('perf/test_acc1', test_stats['acc1'], epoch)
            log_writer.add_scalar('perf/test_acc5', test_stats['acc5'], epoch)
            log_writer.add_scalar('perf/test_loss', test_stats['loss'], epoch)

        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     **{f'test_{k}': v for k, v in test_stats.items()},
                     'epoch': epoch,
                     'n_parameters': n_parameters}

        if args.output_dir and misc.is_main_process():
            if log_writer is not None:
                log_writer.flush()
            with open(os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8") as f:
                f.write(json.dumps(log_stats) + "\n")

    # Flush and release the event file now that training is over.
    if log_writer is not None:
        log_writer.close()

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))


if __name__ == '__main__':
    args = get_args_parser()
    args = args.parse_args()
    if args.output_dir:
        Path(args.output_dir).mkdir(parents=True, exist_ok=True)
    main(args)
# ---- gcmae/main_linprobe.py ----
# --------------------------------------------------------
# References:
# DeiT: https://github.com/facebookresearch/deit
# MoCo v3: https://github.com/facebookresearch/moco-v3
# MAE: https://github.com/facebookresearch/mae
# --------------------------------------------------------
def get_args_parser():
    """Create the argument parser used for GCMAE linear probing.

    The parser is built with ``add_help=False``; options are registered in the
    same order and with the same defaults as before.
    """
    cli = argparse.ArgumentParser('GCMAE linear probing for image classification', add_help=False)
    add = cli.add_argument

    add('--batch_size', default=512, type=int,
        help='Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus')
    add('--epochs', default=90, type=int)
    add('--accum_iter', default=1, type=int,
        help='Accumulate gradient iterations (for increasing the effective batch size under memory constraints)')

    # Model parameters
    add('--model', default='vit_base_patch16', type=str, metavar='MODEL',
        help='Name of model to train')

    # Optimizer parameters
    add('--weight_decay', type=float, default=0,
        help='weight decay (default: 0 for linear probe following MoCo v1)')
    add('--lr', type=float, default=None, metavar='LR',
        help='learning rate (absolute lr)')
    add('--blr', type=float, default=0.1, metavar='LR',
        help='base learning rate: absolute_lr = base_lr * total_batch_size / 256')
    add('--min_lr', type=float, default=0., metavar='LR',
        help='lower lr bound for cyclic schedulers that hit 0')
    add('--warmup_epochs', type=int, default=10, metavar='N',
        help='epochs to warmup LR')

    # * Finetuning params
    add('--finetune', default=' ',
        help='finetune from checkpoint')
    add('--global_pool', action='store_true')
    cli.set_defaults(global_pool=False)
    add('--cls_token', action='store_false', dest='global_pool',
        help='Use class token instead of global pool for classification')

    # Dataset parameters
    add('--data_path_train', default=' ', type=str,
        help='dataset train path')
    add('--data_path_val', default=' ', type=str,
        help='dataset val path')
    add('--nb_classes', default=2, type=int,
        help='number of the classification types')

    # Output, device and resume handling
    add('--output_dir', default=' ',
        help='path where to save, empty for no saving')
    add('--log_dir', default=' ',
        help='path where to tensorboard log')
    add('--device', default='cuda',
        help='device to use for training / testing')
    add('--seed', default=0, type=int)
    add('--resume', default='',
        help='resume from checkpoint')

    add('--start_epoch', default=0, type=int, metavar='N',
        help='start epoch')
    add('--eval', action='store_true',
        help='Perform evaluation only')
    add('--dist_eval', action='store_true', default=False,
        help='Enabling distributed evaluation (recommended during training for faster monitor')
    add('--num_workers', default=20, type=int)
    add('--pin_mem', action='store_true',
        help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.')
    add('--no_pin_mem', action='store_false', dest='pin_mem')
    cli.set_defaults(pin_mem=True)

    # distributed training parameters
    add('--world_size', default=1, type=int,
        help='number of distributed processes')
    add('--local_rank', default=-1, type=int)
    add('--dist_on_itp', action='store_true')
    add('--dist_url', default='env://',
        help='url used to set up distributed training')
    add('--gpu_id', default=0, type=int,
        help="the order of gpu")
    return cli
= %s" % str(sampler_train)) + # if args.dist_eval: + # if len(dataset_val) % num_tasks != 0: + # print('Warning: Enabling distributed evaluation with an eval dataset not divisible by process number. ' + # 'This will slightly alter validation results as extra duplicate entries are added to achieve ' + # 'equal num of samples per-process.') + # sampler_val = torch.utils.data.DistributedSampler( + # dataset_val, num_replicas=num_tasks, rank=global_rank, shuffle=True) # shuffle=True to reduce monitor bias + # else: + # sampler_val = torch.utils.data.SequentialSampler(dataset_val) + # else: + # sampler_train = torch.utils.data.RandomSampler(dataset_train) + # sampler_val = torch.utils.data.SequentialSampler(dataset_val) + + if args.log_dir is not None and not args.eval: + os.makedirs(args.log_dir, exist_ok=True) + log_writer = SummaryWriter(log_dir=args.log_dir) + else: + log_writer = None + + data_loader_train = torch.utils.data.DataLoader( + dataset_train, + batch_size=args.batch_size, + num_workers=args.num_workers, + pin_memory=args.pin_mem, + shuffle = True, + drop_last=True, + ) + + data_loader_val = torch.utils.data.DataLoader( + dataset_val, + batch_size=args.batch_size, + num_workers=args.num_workers, + pin_memory=args.pin_mem, + drop_last=False + ) + + model = models_vit.__dict__[args.model]( + num_classes=args.nb_classes, + global_pool=args.global_pool, + ) + + if args.finetune and not args.eval: + checkpoint = torch.load(args.finetune, map_location='cpu') + + print("Load pre-trained checkpoint from: %s" % args.finetune) + checkpoint_model = checkpoint['model'] + state_dict = model.state_dict() + for k in ['head.weight', 'head.bias']: + if k in checkpoint_model and checkpoint_model[k].shape != state_dict[k].shape: + print(f"Removing key {k} from pretrained checkpoint") + del checkpoint_model[k] + + # interpolate position embedding + interpolate_pos_embed(model, checkpoint_model) + + # load pre-trained model + msg = model.load_state_dict(checkpoint_model, 
strict=False) + print(msg) + + if args.global_pool: + assert set(msg.missing_keys) == {'head.weight', 'head.bias', 'fc_norm.weight', 'fc_norm.bias'} + else: + assert set(msg.missing_keys) == {'head.weight', 'head.bias'} + + # manually initialize fc layer: following MoCo v3 + trunc_normal_(model.head.weight, std=0.01) + + # for linear prob only + # hack: revise model's head with BN + model.head = torch.nn.Sequential(torch.nn.BatchNorm1d(model.head.in_features, affine=False, eps=1e-6), model.head) + # freeze all but the head + for _, p in model.named_parameters(): + p.requires_grad = False + for _, p in model.head.named_parameters(): + p.requires_grad = True + + model.to(device) + + model_without_ddp = model + n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) + + print("Model = %s" % str(model_without_ddp)) + print('number of params (M): %.2f' % (n_parameters / 1.e6)) + + eff_batch_size = args.batch_size * args.accum_iter * misc.get_world_size() + + if args.lr is None: # only base_lr is specified + args.lr = args.blr * eff_batch_size / 256 + + print("base lr: %.2e" % (args.lr * 256 / eff_batch_size)) + print("actual lr: %.2e" % args.lr) + + print("accumulate grad iterations: %d" % args.accum_iter) + print("effective batch size: %d" % eff_batch_size) + + if args.distributed: + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) + model_without_ddp = model.module + + optimizer = LARS(model_without_ddp.head.parameters(), lr=args.lr, weight_decay=args.weight_decay) + print(optimizer) + loss_scaler = NativeScaler() + + criterion = torch.nn.CrossEntropyLoss() + + print("criterion = %s" % str(criterion)) + + misc.load_model(args=args, model_without_ddp=model_without_ddp, optimizer=optimizer, loss_scaler=loss_scaler) + + if args.eval: + test_stats, auc, precision, recall, f1, specificity = evaluate(data_loader_val, model, device, args.nb_classes) + print(f"Accuracy of the network on the {len(dataset_val)} test images: 
{test_stats['acc1']:.2f}%, AUC:{auc:.2f}%, precision {precision:.2f}%, recall {recall:.2f}%, f1_score {f1:.2f}%, specificity {specificity:.2f}%") + exit(0) + + print(f"Start training for {args.epochs} epochs") + start_time = time.time() + max_accuracy = 0.0 + for epoch in range(args.start_epoch, args.epochs): + if args.distributed: + data_loader_train.sampler.set_epoch(epoch) + train_stats = train_one_epoch( + model, criterion, data_loader_train, + optimizer, device, epoch, loss_scaler, + max_norm=None, + log_writer=log_writer, + args=args + ) + if args.output_dir: + misc.save_model( + args=args, model=model, model_without_ddp=model_without_ddp, optimizer=optimizer, + loss_scaler=loss_scaler, epoch=epoch) + + test_stats, auc, precision, recall, f1, specificity = evaluate(data_loader_val, model, device, args.nb_classes) + print(f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.2f}%, AUC: {auc:.2f}%, precision {precision:.2f}%, recall {recall:.2f}%, f1_score {f1:.2f}%, specificity {specificity:.2f}%") + max_accuracy = max(max_accuracy, test_stats["acc1"]) + max_auc = max(max_auc, auc) + print(f'Max accuracy: {max_accuracy:.2f}%, Max AUC: {max_auc:.2f}%') + + if log_writer is not None: + log_writer.add_scalar('perf/test_acc1', test_stats['acc1'], epoch) + log_writer.add_scalar('perf/test_acc5', test_stats['acc5'], epoch) + log_writer.add_scalar('perf/test_loss', test_stats['loss'], epoch) + log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, + **{f'test_{k}': v for k, v in test_stats.items()}, + 'epoch': epoch, + 'n_parameters': n_parameters} + + if args.output_dir and misc.is_main_process(): + if log_writer is not None: + log_writer.flush() + with open(os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8") as f: + f.write(json.dumps(log_stats) + "\n") + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('Training time 
{}'.format(total_time_str)) + + +if __name__ == '__main__': + args = get_args_parser() + args = args.parse_args() + if args.output_dir: + Path(args.output_dir).mkdir(parents=True, exist_ok=True) + main(args) diff --git a/PuzzleTuning/Counterpart PreTrain Methods/gcmae/main_pretrain.py b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/main_pretrain.py new file mode 100644 index 0000000000000000000000000000000000000000..400a117fb03b026cd0edcdfc630b6f569b301b53 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/main_pretrain.py @@ -0,0 +1,260 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# References: +# DeiT: https://github.com/facebookresearch/deit +# BEiT: https://github.com/microsoft/unilm/tree/master/beit +# MAE: https://github.com/facebookresearch/mae +# -------------------------------------------------------- +import argparse +import datetime +import json +from random import shuffle +import numpy as np +import os +import time +from pathlib import Path + +import torch +import torch.backends.cudnn as cudnn +from torch.utils.tensorboard import SummaryWriter +import torchvision.transforms as transforms +import torchvision.datasets as datasets + +import timm + +assert timm.__version__ == "0.3.2" # version check +import timm.optim.optim_factory as optim_factory + +import util.misc as misc +from util.misc import NativeScalerWithGradNormCount as NativeScaler + +import models_gcmae + +from engine_pretrain import train_one_epoch +from lib.NCEAverage import NCEAverage +from lib.NCECriterion import NCECriterion +from test_npid import NN, kNN + +from torch.utils.data.distributed import DistributedSampler + +def get_args_parser(): + parser = argparse.ArgumentParser('GCMAE pre-training', add_help=False) + parser.add_argument('--batch_size', 
default=128, type=int, + help='Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus') + parser.add_argument('--epochs', default=80, type=int) + parser.add_argument('--accum_iter', default=1, type=int, + help='Accumulate gradient iterations (for increasing the effective batch size under memory constraints)') + + # Model parameters + parser.add_argument('--model', default='gcmae_vit_base_patch16', type=str, metavar='MODEL', + help='Name of model to train') + + parser.add_argument('--input_size', default=224, type=int, + help='images input size') + + parser.add_argument('--mask_ratio', default=0.5, type=float, + help='Masking ratio (percentage of removed patches).') + + parser.add_argument('--norm_pix_loss', action='store_true', + help='Use (per-patch) normalized pixels as targets for computing loss') + parser.set_defaults(norm_pix_loss=True) + + # Npid parameters + parser.add_argument('--low_dim', default=768, type=int, help='Low dimension') + parser.add_argument('--nce_k', default=8192, type=int, help='NCE k') + parser.add_argument('--nce_t', default=0.07, type=float, help='NCE t') + parser.add_argument('--nce_m', default=0.5, type=float, help='NCE m') + # Optimizer parameters + parser.add_argument('--weight_decay', type=float, default=0.05, + help='weight decay (default: 0.05)') + + parser.add_argument('--lr', type=float, default=None, metavar='LR', + help='learning rate (absolute lr)') + parser.add_argument('--blr', type=float, default=1e-3, metavar='LR', + help='base learning rate: absolute_lr = base_lr * total_batch_size / 256') + parser.add_argument('--min_lr', type=float, default=0., metavar='LR', + help='lower lr bound for cyclic schedulers that hit 0') + + parser.add_argument('--warmup_epochs', type=int, default=40, metavar='N', + help='epochs to warmup LR') + + # Dataset parameters + parser.add_argument('--data_path', default=' ', type=str, + help='dataset path') + + # parser.add_argument('--data_val_path', default=' ', type=str, + # 
class ImageFolderInstance(datasets.ImageFolder):
    """``ImageFolder`` variant whose items also carry their dataset index.

    Each item is ``(image, target, index)``; the extra ``index`` is simply the
    integer that was passed to ``__getitem__``.
    """

    def __getitem__(self, index):
        """Load sample ``index``, apply the optional transforms, return it with ``index``."""
        file_path, label = self.imgs[index]
        sample = self.loader(file_path)

        # transforms are optional; apply the image one first, then the target one
        if self.transform is not None:
            sample = self.transform(sample)
        if self.target_transform is not None:
            label = self.target_transform(label)

        return sample, label, index
torch.device(args.device) + + # fix the seed for reproducibility + seed = args.seed + misc.get_rank() + torch.manual_seed(seed) + np.random.seed(seed) + + cudnn.benchmark = True + + # simple augmentation + transform_data = transforms.Compose([ + transforms.RandomResizedCrop(args.input_size, scale=(0.2, 1.0), interpolation=3), # 3 is bicubic + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(mean=[0.6790435, 0.5052883, 0.66902906], std=[0.19158737, 0.2039779, 0.15648715])]) + dataset_train = ImageFolderInstance(args.data_path, transform=transform_data) + + print(dataset_train) + + if args.log_dir is not None: + os.makedirs(args.log_dir, exist_ok=True) + log_writer = SummaryWriter(log_dir=args.log_dir) + else: + log_writer = None + + if args.distributed: + datasampler = DistributedSampler(dataset_train, rank=misc.get_rank(), shuffle=True, drop_last=True) + else: + datasampler = None + data_loader_train = torch.utils.data.DataLoader( + dataset_train, + batch_size=args.batch_size, + num_workers=args.num_workers, + pin_memory=args.pin_mem, + shuffle=(datasampler is None), + drop_last=(datasampler is None), + sampler=datasampler + ) + + ndata = dataset_train.__len__() + + lemniscate = NCEAverage(args.low_dim, ndata, args.nce_k, args.nce_t, args.nce_m) + criterion = NCECriterion(ndata) + + model = models_gcmae.__dict__[args.model](norm_pix_loss=args.norm_pix_loss, lemniscate=lemniscate, criterion=criterion, args= args) + + # load weight from file + if args.init_weight_pth: + print(f'Loading weight from {args.init_weight_pth}...') + init_weight = torch.load(args.init_weight_pth) + model.load_state_dict(init_weight, strict=False) + print('Weight loaded.') + + model.to(device) + + model_without_ddp = model + print("Model = %s" % str(model_without_ddp)) + + eff_batch_size = args.batch_size * args.accum_iter * misc.get_world_size() + + if args.lr is None: # only base_lr is specified + args.lr = args.blr * eff_batch_size / 256 + + print("base 
lr: %.2e" % (args.lr * 256 / eff_batch_size)) + print("actual lr: %.2e" % args.lr) + + print("accumulate grad iterations: %d" % args.accum_iter) + print("effective batch size: %d" % eff_batch_size) + + if args.distributed: + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=True) + model_without_ddp = model.module + # following timm: set wd as 0 for bias and norm layers + param_groups = optim_factory.add_weight_decay(model_without_ddp, args.weight_decay) + optimizer = torch.optim.AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95)) + print(optimizer) + loss_scaler = NativeScaler() + + misc.load_model(args=args, model_without_ddp=model_without_ddp, optimizer=optimizer, loss_scaler=loss_scaler) + + print(f"Start training for {args.epochs} epochs") + start_time = time.time() + for epoch in range(args.start_epoch, args.epochs): + if args.distributed: + data_loader_train.sampler.set_epoch(epoch) + train_stats = train_one_epoch( + model, data_loader_train, + optimizer, device, epoch, loss_scaler, + log_writer=log_writer, + args=args, + lemniscate = lemniscate, + ) + + if args.output_dir and (epoch % 20 == 0 or epoch + 1 == args.epochs): + misc.save_model( + args=args, model=model, model_without_ddp=model_without_ddp, optimizer=optimizer, + loss_scaler=loss_scaler, epoch=epoch) + + log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, + 'epoch': epoch,} + + if args.output_dir and misc.is_main_process(): + if log_writer is not None: + log_writer.flush() + with open(os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8") as f: + f.write(json.dumps(log_stats) + "\n") + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('Training time {}'.format(total_time_str)) + + +if __name__ == '__main__': + args = get_args_parser() + args = args.parse_args() + if args.output_dir: + Path(args.output_dir).mkdir(parents=True, exist_ok=True) + 
class VisionTransformer(timm.models.vision_transformer.VisionTransformer):
    """timm Vision Transformer used as a pure encoder.

    ``forward`` is overridden to return the pooled feature (the classification
    head is not applied). With ``global_pool=True`` the feature is the mean of
    the patch tokens passed through ``fc_norm``; otherwise it is the normalised
    class token.
    """

    def __init__(self, global_pool=False, **kwargs):
        super().__init__(**kwargs)

        self.global_pool = global_pool
        if not self.global_pool:
            return

        # global pooling normalises after averaging, so swap norm -> fc_norm
        self.fc_norm = kwargs['norm_layer'](kwargs['embed_dim'])
        del self.norm  # remove the original norm

    def forward(self, x):
        """Return one feature vector per image in the batch ``x``."""
        batch = x.shape[0]
        tokens = self.patch_embed(x)

        # prepend the (batch-expanded) class token, then add position embeddings
        cls = self.cls_token.expand(batch, -1, -1)
        tokens = self.pos_drop(torch.cat((cls, tokens), dim=1) + self.pos_embed)

        for block in self.blocks:
            tokens = block(tokens)

        if self.global_pool:
            # global pool without cls token
            pooled = tokens[:, 1:, :].mean(dim=1)
            return self.fc_norm(pooled)

        return self.norm(tokens)[:, 0]
embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, + norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + return model + + +def vit_large_patch16(**kwargs): + model = VisionTransformer( + patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True, + norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + return model + + +def vit_huge_patch14(**kwargs): + model = VisionTransformer( + patch_size=14, embed_dim=1280, depth=32, num_heads=16, mlp_ratio=4, qkv_bias=True, + norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + return model \ No newline at end of file diff --git a/PuzzleTuning/Counterpart PreTrain Methods/gcmae/models_gcmae.py b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/models_gcmae.py new file mode 100644 index 0000000000000000000000000000000000000000..a92895fec6fbdb1f4f53f5f1bea37847a583d928 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/models_gcmae.py @@ -0,0 +1,298 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
class MaskedAutoencoderViT(nn.Module):
    """Masked Autoencoder with a VisionTransformer backbone plus an NPID branch.

    The encoder sees only the un-masked patches; the decoder reconstructs the
    pixels of the masked ones (MAE loss). In parallel, the mean of the encoder
    tokens is normalised and scored by ``lemniscate`` / ``criterion``
    (instance-discrimination loss).

    Args:
        img_size, patch_size, in_chans: forwarded to ``PatchEmbed``.
        embed_dim, depth, num_heads: encoder width / number of blocks / heads.
        decoder_embed_dim, decoder_depth, decoder_num_heads: same for the decoder.
        mlp_ratio: MLP expansion ratio of every ``Block``.
        norm_layer: normalisation layer constructor.
        norm_pix_loss: normalise each target patch before the reconstruction loss.
        lemniscate: callable ``(features, index) -> output`` used by the NPID branch.
        criterion: callable ``(output, index) -> loss`` used by the NPID branch.
        args: stored on the module as ``self.args``; not otherwise used here.
        iter_size: divisor applied to the NPID loss.
    """
    def __init__(self, img_size=224, patch_size=16, in_chans=3,
                 embed_dim=1024, depth=24, num_heads=16,
                 decoder_embed_dim=512, decoder_depth=8, decoder_num_heads=16,
                 mlp_ratio=4., norm_layer=nn.LayerNorm, norm_pix_loss=False, lemniscate=None, criterion=None, args=None,
                 iter_size=1):
        super().__init__()
        self.args = args
        # NOTE: an extra projection MLP (self.fc) for the NPID branch is disabled.

        # --------------------------------------------------------------------------
        # MAE encoder specifics
        self.patch_embed = PatchEmbed(img_size, patch_size, in_chans, embed_dim)
        num_patches = self.patch_embed.num_patches

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim), requires_grad=False)  # fixed sin-cos embedding

        self.blocks = nn.ModuleList([
            Block(embed_dim, num_heads, mlp_ratio, qkv_bias=True, qk_scale=None, norm_layer=norm_layer)
            for i in range(depth)])
        self.norm = norm_layer(embed_dim)
        # --------------------------------------------------------------------------
        # NPID branch
        self.lemniscate = lemniscate
        self.criterion = criterion
        self.iter_size = iter_size
        self.l2norm = Normalize(2)
        # --------------------------------------------------------------------------
        # MAE decoder specifics
        self.decoder_embed = nn.Linear(embed_dim, decoder_embed_dim, bias=True)

        self.mask_token = nn.Parameter(torch.zeros(1, 1, decoder_embed_dim))

        self.decoder_pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, decoder_embed_dim), requires_grad=False)  # fixed sin-cos embedding

        self.decoder_blocks = nn.ModuleList([
            Block(decoder_embed_dim, decoder_num_heads, mlp_ratio, qkv_bias=True, qk_scale=None, norm_layer=norm_layer)
            for i in range(decoder_depth)])

        self.decoder_norm = norm_layer(decoder_embed_dim)
        self.decoder_pred = nn.Linear(decoder_embed_dim, patch_size**2 * in_chans, bias=True)  # decoder to patch
        # --------------------------------------------------------------------------

        self.norm_pix_loss = norm_pix_loss

        self.initialize_weights()

    def initialize_weights(self):
        """Fill the fixed position embeddings and initialise all learnable weights."""
        # initialize (and freeze) pos_embed by sin-cos embedding
        pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], int(self.patch_embed.num_patches**.5), cls_token=True)
        self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))

        decoder_pos_embed = get_2d_sincos_pos_embed(self.decoder_pos_embed.shape[-1], int(self.patch_embed.num_patches**.5), cls_token=True)
        self.decoder_pos_embed.data.copy_(torch.from_numpy(decoder_pos_embed).float().unsqueeze(0))

        # initialize patch_embed like nn.Linear (instead of nn.Conv2d)
        w = self.patch_embed.proj.weight.data
        torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1]))

        # timm's trunc_normal_(std=.02) is effectively normal_(std=0.02) as cutoff is too big (2.)
        torch.nn.init.normal_(self.cls_token, std=.02)
        torch.nn.init.normal_(self.mask_token, std=.02)

        # initialize nn.Linear and nn.LayerNorm
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Per-module initialiser used with ``self.apply``."""
        if isinstance(m, nn.Linear):
            # we use xavier_uniform following official JAX ViT:
            torch.nn.init.xavier_uniform_(m.weight)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def patchify(self, imgs):
        """
        imgs: (N, 3, H, W)
        x: (N, L, patch_size**2 *3)

        Requires H == W and H divisible by the patch size.
        """
        p = self.patch_embed.patch_size[0]
        assert imgs.shape[2] == imgs.shape[3] and imgs.shape[2] % p == 0

        h = w = imgs.shape[2] // p
        x = imgs.reshape(shape=(imgs.shape[0], 3, h, p, w, p))
        x = torch.einsum('nchpwq->nhwpqc', x)
        x = x.reshape(shape=(imgs.shape[0], h * w, p**2 * 3))
        return x

    def unpatchify(self, x):
        """
        x: (N, L, patch_size**2 *3)
        imgs: (N, 3, H, W)

        Inverse of ``patchify``; L must be a perfect square.
        """
        p = self.patch_embed.patch_size[0]
        h = w = int(x.shape[1]**.5)
        assert h * w == x.shape[1]

        x = x.reshape(shape=(x.shape[0], h, w, p, p, 3))
        x = torch.einsum('nhwpqc->nchpwq', x)
        imgs = x.reshape(shape=(x.shape[0], 3, h * p, w * p))
        return imgs

    def outside_block_fix(self, x, mask_ratio):
        """Build a deterministic "keep the centre block" score map.

        Returns an ``[N, L]`` tensor that is 0 on a centred square of side
        ``int(sqrt(L * (1 - mask_ratio)))`` and 1 everywhere else, so that an
        ascending argsort puts the centre block among the kept tokens.

        The padding is derived from the patch grid (``int(sqrt(L))`` per side)
        instead of the former hard-coded ``(3, 4, 3, 4)``, which only produced
        a length-L result for a 14x14 grid with ``mask_ratio=0.75``. For that
        case the result is unchanged.
        """
        N, L, D = x.shape  # batch, length, dim
        grid = int(L**.5)
        h = w = int((L * (1 - mask_ratio))**.5)
        keep = torch.zeros([N, h, w], device=x.device)
        # split the border as evenly as possible; the extra row/col goes last
        before = (grid - h) // 2
        after = grid - h - before
        pad = nn.ConstantPad2d((before, after, before, after), 1)
        mask = pad(keep).flatten(1)
        return mask

    def random_masking(self, x, mask_ratio, random=True):
        """
        Perform per-sample random masking by per-sample shuffling.
        Per-sample shuffling is done by argsort random noise.
        x: [N, L, D], sequence

        With ``random=False`` the noise is replaced by ``outside_block_fix``.

        Returns:
            x_masked: [N, len_keep, D] kept tokens,
            mask: [N, L] binary mask in original order (0 keep, 1 remove),
            ids_restore: [N, L] indices that undo the shuffle.
        """
        N, L, D = x.shape  # batch, length, dim
        len_keep = int(L * (1 - mask_ratio))
        if random:
            noise = torch.rand(N, L, device=x.device)  # noise in [0, 1]
        else:
            noise = self.outside_block_fix(x, mask_ratio)
        # sort noise for each sample
        ids_shuffle = torch.argsort(noise, dim=1)  # ascend: small is keep, large is remove
        ids_restore = torch.argsort(ids_shuffle, dim=1)

        # keep the first subset
        ids_keep = ids_shuffle[:, :len_keep]
        x_masked = torch.gather(x, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, D))

        # generate the binary mask: 0 is keep, 1 is remove
        mask = torch.ones([N, L], device=x.device)
        mask[:, :len_keep] = 0
        # unshuffle to get the binary mask
        mask = torch.gather(mask, dim=1, index=ids_restore)

        return x_masked, mask, ids_restore

    def forward_encoder(self, x, mask_ratio):
        """Encode the visible patches of ``x``; returns ``(latent, mask, ids_restore)``."""
        # embed patches
        x = self.patch_embed(x)

        # add pos embed w/o cls token
        x = x + self.pos_embed[:, 1:, :]

        # masking: length -> length * (1 - mask_ratio) tokens are kept
        x, mask, ids_restore = self.random_masking(x, mask_ratio, True)

        # append cls token
        cls_token = self.cls_token + self.pos_embed[:, :1, :]
        cls_tokens = cls_token.expand(x.shape[0], -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)

        # apply Transformer blocks
        for blk in self.blocks:
            x = blk(x)

        x = self.norm(x)
        return x, mask, ids_restore

    def forward_decoder(self, x, ids_restore):
        """Decode encoder tokens back to per-patch pixel predictions ``[N, L, p*p*in_chans]``."""
        # embed tokens
        x = self.decoder_embed(x)

        # append mask tokens to sequence
        mask_tokens = self.mask_token.repeat(x.shape[0], ids_restore.shape[1] + 1 - x.shape[1], 1)
        x_ = torch.cat([x[:, 1:, :], mask_tokens], dim=1)  # no cls token
        x_ = torch.gather(x_, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, x.shape[2]))  # unshuffle
        x = torch.cat([x[:, :1, :], x_], dim=1)  # append cls token

        # add pos embed
        x = x + self.decoder_pos_embed

        # apply Transformer blocks
        for blk in self.decoder_blocks:
            x = blk(x)
        x = self.decoder_norm(x)

        # predictor projection
        x = self.decoder_pred(x)

        # remove cls token
        x = x[:, 1:, :]

        return x

    def forward_npid(self, x):
        """Average the encoder tokens (cls token included) and pass them through ``self.l2norm``."""
        x = x.mean(dim=1)
        x = self.l2norm(x)  # presumably L2 normalisation (lib.normalize.Normalize(2)) -- TODO confirm
        return x

    def forward_npid_loss(self, x, index):
        """Score features against ``lemniscate`` and return ``(output, loss / iter_size)``."""
        output = self.lemniscate(x, index)
        loss = self.criterion(output, index) / self.iter_size
        return output, loss

    def forward_loss(self, imgs, pred, mask):
        """
        imgs: [N, 3, H, W]
        pred: [N, L, p*p*3]
        mask: [N, L], 0 is keep, 1 is remove,

        Returns the mean squared error averaged over removed patches only.
        """
        target = self.patchify(imgs)
        if self.norm_pix_loss:
            mean = target.mean(dim=-1, keepdim=True)
            var = target.var(dim=-1, keepdim=True)
            target = (target - mean) / (var + 1.e-6)**.5

        loss = (pred - target) ** 2
        loss = loss.mean(dim=-1)  # [N, L], mean loss per patch

        loss = (loss * mask).sum() / mask.sum()  # mean loss on removed patches
        return loss

    def forward(self, imgs, mask_ratio=0.75, index=None, is_train=False, mae=False, npid=False, npid_feature=False):
        """Run the encoder and then the branch selected by the first true flag.

        Returns:
            is_train:     ``(loss_mae, pred, mask, loss_npid, output_npid)``
            mae:          ``(loss_mae, pred, mask, None, None)``
            npid:         ``(None, None, None, loss_npid, output_npid)``
            npid_feature: the normalised NPID feature tensor
            none set:     ``None``

        ``index`` is passed to the NPID branch and is therefore needed for
        ``is_train`` and ``npid``.
        """
        latent, mask, ids_restore = self.forward_encoder(imgs, mask_ratio)
        if is_train:
            npid_x = self.forward_npid(latent)
            output_npid, loss_npid = self.forward_npid_loss(npid_x, index)

            pred = self.forward_decoder(latent, ids_restore)  # [N, L, p*p*3]
            loss = self.forward_loss(imgs, pred, mask)
            return loss, pred, mask, loss_npid, output_npid
        elif mae:
            pred = self.forward_decoder(latent, ids_restore)  # [N, L, p*p*3]
            loss = self.forward_loss(imgs, pred, mask)
            return loss, pred, mask, None, None
        elif npid:
            npid_x = self.forward_npid(latent)
            output_npid, loss_npid = self.forward_npid_loss(npid_x, index)
            return None, None, None, loss_npid, output_npid
        elif npid_feature:
            npid_x = self.forward_npid(latent)
            return npid_x
class VisionTransformer(timm.models.vision_transformer.VisionTransformer):
    """timm Vision Transformer with optional global average pooling.

    Only ``forward_features`` is overridden. With ``global_pool=True`` it
    returns the ``fc_norm``-normalised mean of the patch tokens; otherwise it
    returns the normalised class token.
    """

    def __init__(self, global_pool=False, **kwargs):
        super().__init__(**kwargs)

        self.global_pool = global_pool
        if not self.global_pool:
            return

        # pooled features get their own norm; the per-token norm is dropped
        self.fc_norm = kwargs['norm_layer'](kwargs['embed_dim'])
        del self.norm  # remove the original norm

    def forward_features(self, x):
        """Return one pooled feature vector per image in the batch ``x``."""
        n_images = x.shape[0]
        seq = self.patch_embed(x)

        # class token is broadcast over the batch and placed first
        cls_tok = self.cls_token.expand(n_images, -1, -1)
        seq = torch.cat((cls_tok, seq), dim=1)
        seq = self.pos_drop(seq + self.pos_embed)

        for layer in self.blocks:
            seq = layer(seq)

        if not self.global_pool:
            return self.norm(seq)[:, 0]

        # global pool without cls token
        return self.fc_norm(seq[:, 1:, :].mean(dim=1))
newline at end of file diff --git a/PuzzleTuning/Counterpart PreTrain Methods/gcmae/nohup.out b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/nohup.out new file mode 100644 index 0000000000000000000000000000000000000000..4fc73e466691ac13d44bd37963f83bcb3df8fc0a --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/nohup.out @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0196109994c58e9b073c1f239463dba4a9fd1f714ccb919b1f6989233284fb13 +size 10860427 diff --git a/PuzzleTuning/Counterpart PreTrain Methods/gcmae/output/log.txt b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/output/log.txt new file mode 100644 index 0000000000000000000000000000000000000000..de03ade4b03833522e0896d16cc7bb602abb2308 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/output/log.txt @@ -0,0 +1,20 @@ +{"train_lr": 1.249928325688073e-05, "train_loss_all": 1.6951392864893882, "train_loss_mae": 0.6232919644752364, "train_loss_npid": 10.718472986538476, "epoch": 0} +{"train_lr": 3.7499283256880734e-05, "train_loss_all": 1.2544193148886391, "train_loss_mae": 0.29345881425168946, "train_loss_npid": 9.609604887513939, "epoch": 1} +{"train_lr": 6.249928325688072e-05, "train_loss_all": 0.9934084515198381, "train_loss_mae": 0.23100289355433726, "train_loss_npid": 7.6240554602867965, "epoch": 2} +{"train_lr": 8.749928325688076e-05, "train_loss_all": 0.8070334031836155, "train_loss_mae": 0.2147223876712314, "train_loss_npid": 5.923110032737802, "epoch": 3} +{"train_lr": 0.00011249928325688074, "train_loss_all": 0.6624781838387525, "train_loss_mae": 0.20488846144732942, "train_loss_npid": 4.575897163918259, "epoch": 4} +{"train_lr": 0.00013749928325688075, "train_loss_all": 0.5504872641466353, "train_loss_mae": 0.19673029863352487, "train_loss_npid": 3.5375695937270417, "epoch": 5} +{"train_lr": 0.00016249928325688076, "train_loss_all": 0.4680212429699001, "train_loss_mae": 0.18649238266153345, "train_loss_npid": 2.8152885437285136, "epoch": 6} 
+{"train_lr": 0.0001874992832568808, "train_loss_all": 0.4125429083225787, "train_loss_mae": 0.17726122452193924, "train_loss_npid": 2.352816803630339, "epoch": 7} +{"train_lr": 0.0002124992832568808, "train_loss_all": 0.3781979799766196, "train_loss_mae": 0.17158613237253295, "train_loss_npid": 2.0661184463274043, "epoch": 8} +{"train_lr": 0.00023749928325688058, "train_loss_all": 0.3560920784813822, "train_loss_mae": 0.16740265586328001, "train_loss_npid": 1.886894195021019, "epoch": 9} +{"train_lr": 0.00024795413078163535, "train_loss_all": 0.339586502899749, "train_loss_mae": 0.16270318633338454, "train_loss_npid": 1.768833134712976, "epoch": 10} +{"train_lr": 0.00023591887458797715, "train_loss_all": 0.3261361276269506, "train_loss_mae": 0.15826478182346723, "train_loss_npid": 1.678713425973413, "epoch": 11} +{"train_lr": 0.0002130261061324063, "train_loss_all": 0.31538585492606286, "train_loss_mae": 0.1542001661195189, "train_loss_npid": 1.6118568577350827, "epoch": 12} +{"train_lr": 0.00018151672909465076, "train_loss_all": 0.3060923837296186, "train_loss_mae": 0.15042925238429886, "train_loss_npid": 1.5566312829767346, "epoch": 13} +{"train_lr": 0.0001444751008379059, "train_loss_all": 0.29751537771673375, "train_loss_mae": 0.14653637044798207, "train_loss_npid": 1.5097900425212099, "epoch": 14} +{"train_lr": 0.00010552711402013923, "train_loss_all": 0.28930359132622085, "train_loss_mae": 0.14255208674809733, "train_loss_npid": 1.467515015848186, "epoch": 15} +{"train_lr": 6.848526895765559e-05, "train_loss_all": 0.2819806246206574, "train_loss_mae": 0.13879279353049234, "train_loss_npid": 1.43187828023636, "epoch": 16} +{"train_lr": 3.6975479530882305e-05, "train_loss_all": 0.27513516249493997, "train_loss_mae": 0.13510516505397924, "train_loss_npid": 1.400299943135966, "epoch": 17} +{"train_lr": 1.408214347052291e-05, "train_loss_all": 0.2700371737241608, "train_loss_mae": 0.13246216934490437, "train_loss_npid": 1.3757500144456505, "epoch": 18} 
+{"train_lr": 2.046220017417465e-06, "train_loss_all": 0.26699303928762674, "train_loss_mae": 0.13090999836503708, "train_loss_npid": 1.3608303785324096, "epoch": 19} diff --git a/PuzzleTuning/Counterpart PreTrain Methods/gcmae/pretrain.sh b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/pretrain.sh new file mode 100644 index 0000000000000000000000000000000000000000..af15be241dc8b494ffe8c8939b4a01a395a321a9 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/pretrain.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# ps -ef | grep pretrain | awk '{print $2}' |xargs kill + +# Training settings +pretrain_model="timm" +dataset="All" +model_weights="/root/autodl-tmp/model_base/ViT_b16_224_Imagenet.pth" + +# Init params +data_path="/root/autodl-tmp/datasets/${dataset}" +model_name="ViT_b16_224_timm_GCMAE_ALL_80.pth" +checkpoint_path="/root/autodl-tmp/LSQ/checkpoint/${pretrain_model}" +save_weight_path="/root/autodl-tmp/LSQ/model_saved/" +tensorboard_path="/root/tf-logs/" + +# Training. Save checkpoint every 20 epochs. +# The checkpoint and backbone model will be available under checkpoint_path folder. 
+set -e + +# train +python -u -m torch.distributed.launch \ + --nproc_per_node 4 \ + main_pretrain.py \ + --data_path $data_path \ + --output_dir $checkpoint_path \ + --log_dir $tensorboard_path \ + --batch_size 64 \ + --model gcmae_vit_base_patch16 \ + --norm_pix_loss \ + --mask_ratio 0.5 \ + --epochs 80 \ + --warmup_epochs 40 \ + --blr 1e-3 --weight_decay 0.05 \ + --low_dim 768 \ + --nce_k 8192 \ + --nce_t 0.07 \ + --nce_m 0.5 \ + --init_weight_pth $model_weights + +# extract & save model +python -u load_vit_from_ckpt.py \ + --basic-weight ${model_weights} \ + --checkpoint ${checkpoint_path}/checkpoint-79.pth \ + --save-to $save_weight_path \ + --save-name $model_name \ + --num-classes 2 + +set +e + +# # packup checkpoints +# nohup zip GCMAE_2.zip checkpoint-0.pth & +# nohup zip GCMAE_3.zip checkpoint-20.pth & +# nohup zip GCMAE_4.zip checkpoint-40.pth & +# nohup zip GCMAE_5.zip checkpoint-60.pth & +# nohup zip GCMAE_6.zip checkpoint-79.pth & \ No newline at end of file diff --git a/PuzzleTuning/Counterpart PreTrain Methods/gcmae/requirements.txt b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4285171bad0b03f89a37ef1320178d012d7abd46 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/requirements.txt @@ -0,0 +1,13 @@ +matplotlib +tensorboardx +opencv-python +pandas +Pillow +scikit-image +scikit-learn +scipy +seaborn +sentry-sdk +urllib3 +tensorboard +tqdm \ No newline at end of file diff --git a/PuzzleTuning/Counterpart PreTrain Methods/gcmae/submitit_finetune.py b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/submitit_finetune.py new file mode 100644 index 0000000000000000000000000000000000000000..cce5883bccac0329ca67f99d6a56219afe31425d --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/submitit_finetune.py @@ -0,0 +1,131 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+ +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# A script to run multinode training with submitit. +# -------------------------------------------------------- + +import argparse +import os +import uuid +from pathlib import Path + +import main_finetune as classification +import submitit + + +def parse_args(): + classification_parser = classification.get_args_parser() + parser = argparse.ArgumentParser("Submitit for MAE finetune", parents=[classification_parser]) + parser.add_argument("--ngpus", default=8, type=int, help="Number of gpus to request on each node") + parser.add_argument("--nodes", default=2, type=int, help="Number of nodes to request") + parser.add_argument("--timeout", default=4320, type=int, help="Duration of the job") + parser.add_argument("--job_dir", default="", type=str, help="Job dir. Leave empty for automatic.") + + parser.add_argument("--partition", default="learnfair", type=str, help="Partition where to submit") + parser.add_argument("--use_volta32", action='store_true', help="Request 32G V100 GPUs") + parser.add_argument('--comment', default="", type=str, help="Comment to pass to scheduler") + return parser.parse_args() + + +def get_shared_folder() -> Path: + user = os.getenv("USER") + if Path("/checkpoint/").is_dir(): + p = Path(f"/checkpoint/{user}/experiments") + p.mkdir(exist_ok=True) + return p + raise RuntimeError("No shared folder available") + + +def get_init_file(): + # Init file must not exist, but it's parent dir must exist. 
+ os.makedirs(str(get_shared_folder()), exist_ok=True) + init_file = get_shared_folder() / f"{uuid.uuid4().hex}_init" + if init_file.exists(): + os.remove(str(init_file)) + return init_file + + +class Trainer(object): + def __init__(self, args): + self.args = args + + def __call__(self): + import main_finetune as classification + + self._setup_gpu_args() + classification.main(self.args) + + def checkpoint(self): + import os + import submitit + + self.args.dist_url = get_init_file().as_uri() + checkpoint_file = os.path.join(self.args.output_dir, "checkpoint.pth") + if os.path.exists(checkpoint_file): + self.args.resume = checkpoint_file + print("Requeuing ", self.args) + empty_trainer = type(self)(self.args) + return submitit.helpers.DelayedSubmission(empty_trainer) + + def _setup_gpu_args(self): + import submitit + from pathlib import Path + + job_env = submitit.JobEnvironment() + self.args.output_dir = Path(str(self.args.output_dir).replace("%j", str(job_env.job_id))) + self.args.log_dir = self.args.output_dir + self.args.gpu = job_env.local_rank + self.args.rank = job_env.global_rank + self.args.world_size = job_env.num_tasks + print(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") + + +def main(): + args = parse_args() + if args.job_dir == "": + args.job_dir = get_shared_folder() / "%j" + + # Note that the folder will depend on the job_id, to easily track experiments + executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30) + + num_gpus_per_node = args.ngpus + nodes = args.nodes + timeout_min = args.timeout + + partition = args.partition + kwargs = {} + if args.use_volta32: + kwargs['slurm_constraint'] = 'volta32gb' + if args.comment: + kwargs['slurm_comment'] = args.comment + + executor.update_parameters( + mem_gb=40 * num_gpus_per_node, + gpus_per_node=num_gpus_per_node, + tasks_per_node=num_gpus_per_node, # one task per GPU + cpus_per_task=10, + nodes=nodes, + timeout_min=timeout_min, + # Below are cluster 
dependent parameters + slurm_partition=partition, + slurm_signal_delay_s=120, + **kwargs + ) + + executor.update_parameters(name="mae") + + args.dist_url = get_init_file().as_uri() + args.output_dir = args.job_dir + + trainer = Trainer(args) + job = executor.submit(trainer) + + # print("Submitted job_id:", job.job_id) + print(job.job_id) + + +if __name__ == "__main__": + main() diff --git a/PuzzleTuning/Counterpart PreTrain Methods/gcmae/submitit_linprobe.py b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/submitit_linprobe.py new file mode 100644 index 0000000000000000000000000000000000000000..571186d3de27c68933a5a009206d793840f51da6 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/submitit_linprobe.py @@ -0,0 +1,131 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# A script to run multinode training with submitit. +# -------------------------------------------------------- + +import argparse +import os +import uuid +from pathlib import Path + +import main_linprobe as classification +import submitit + + +def parse_args(): + classification_parser = classification.get_args_parser() + parser = argparse.ArgumentParser("Submitit for MAE linear probe", parents=[classification_parser]) + parser.add_argument("--ngpus", default=8, type=int, help="Number of gpus to request on each node") + parser.add_argument("--nodes", default=2, type=int, help="Number of nodes to request") + parser.add_argument("--timeout", default=4320, type=int, help="Duration of the job") + parser.add_argument("--job_dir", default="", type=str, help="Job dir. 
Leave empty for automatic.") + + parser.add_argument("--partition", default="learnfair", type=str, help="Partition where to submit") + parser.add_argument("--use_volta32", action='store_true', help="Request 32G V100 GPUs") + parser.add_argument('--comment', default="", type=str, help="Comment to pass to scheduler") + return parser.parse_args() + + +def get_shared_folder() -> Path: + user = os.getenv("USER") + if Path("/checkpoint/").is_dir(): + p = Path(f"/checkpoint/{user}/experiments") + p.mkdir(exist_ok=True) + return p + raise RuntimeError("No shared folder available") + + +def get_init_file(): + # Init file must not exist, but it's parent dir must exist. + os.makedirs(str(get_shared_folder()), exist_ok=True) + init_file = get_shared_folder() / f"{uuid.uuid4().hex}_init" + if init_file.exists(): + os.remove(str(init_file)) + return init_file + + +class Trainer(object): + def __init__(self, args): + self.args = args + + def __call__(self): + import main_linprobe as classification + + self._setup_gpu_args() + classification.main(self.args) + + def checkpoint(self): + import os + import submitit + + self.args.dist_url = get_init_file().as_uri() + checkpoint_file = os.path.join(self.args.output_dir, "checkpoint.pth") + if os.path.exists(checkpoint_file): + self.args.resume = checkpoint_file + print("Requeuing ", self.args) + empty_trainer = type(self)(self.args) + return submitit.helpers.DelayedSubmission(empty_trainer) + + def _setup_gpu_args(self): + import submitit + from pathlib import Path + + job_env = submitit.JobEnvironment() + self.args.output_dir = Path(str(self.args.output_dir).replace("%j", str(job_env.job_id))) + self.args.log_dir = self.args.output_dir + self.args.gpu = job_env.local_rank + self.args.rank = job_env.global_rank + self.args.world_size = job_env.num_tasks + print(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") + + +def main(): + args = parse_args() + if args.job_dir == "": + args.job_dir = get_shared_folder() / 
"%j" + + # Note that the folder will depend on the job_id, to easily track experiments + executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30) + + num_gpus_per_node = args.ngpus + nodes = args.nodes + timeout_min = args.timeout + + partition = args.partition + kwargs = {} + if args.use_volta32: + kwargs['slurm_constraint'] = 'volta32gb' + if args.comment: + kwargs['slurm_comment'] = args.comment + + executor.update_parameters( + mem_gb=40 * num_gpus_per_node, + gpus_per_node=num_gpus_per_node, + tasks_per_node=num_gpus_per_node, # one task per GPU + cpus_per_task=10, + nodes=nodes, + timeout_min=timeout_min, + # Below are cluster dependent parameters + slurm_partition=partition, + slurm_signal_delay_s=120, + **kwargs + ) + + executor.update_parameters(name="mae") + + args.dist_url = get_init_file().as_uri() + args.output_dir = args.job_dir + + trainer = Trainer(args) + job = executor.submit(trainer) + + # print("Submitted job_id:", job.job_id) + print(job.job_id) + + +if __name__ == "__main__": + main() diff --git a/PuzzleTuning/Counterpart PreTrain Methods/gcmae/submitit_pretrain.py b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/submitit_pretrain.py new file mode 100644 index 0000000000000000000000000000000000000000..384b8ad0e65359b656e104df664c4d88711ee49d --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/submitit_pretrain.py @@ -0,0 +1,131 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# A script to run multinode training with submitit. 
+# -------------------------------------------------------- + +import argparse +import os +import uuid +from pathlib import Path + +import main_pretrain as trainer +import submitit + + +def parse_args(): + trainer_parser = trainer.get_args_parser() + parser = argparse.ArgumentParser("Submitit for MAE pretrain", parents=[trainer_parser]) + parser.add_argument("--ngpus", default=8, type=int, help="Number of gpus to request on each node") + parser.add_argument("--nodes", default=2, type=int, help="Number of nodes to request") + parser.add_argument("--timeout", default=4320, type=int, help="Duration of the job") + parser.add_argument("--job_dir", default="", type=str, help="Job dir. Leave empty for automatic.") + + parser.add_argument("--partition", default="learnfair", type=str, help="Partition where to submit") + parser.add_argument("--use_volta32", action='store_true', help="Request 32G V100 GPUs") + parser.add_argument('--comment', default="", type=str, help="Comment to pass to scheduler") + return parser.parse_args() + + +def get_shared_folder() -> Path: + user = os.getenv("USER") + if Path("/checkpoint/").is_dir(): + p = Path(f"/checkpoint/{user}/experiments") + p.mkdir(exist_ok=True) + return p + raise RuntimeError("No shared folder available") + + +def get_init_file(): + # Init file must not exist, but it's parent dir must exist. 
+ os.makedirs(str(get_shared_folder()), exist_ok=True) + init_file = get_shared_folder() / f"{uuid.uuid4().hex}_init" + if init_file.exists(): + os.remove(str(init_file)) + return init_file + + +class Trainer(object): + def __init__(self, args): + self.args = args + + def __call__(self): + import main_pretrain as trainer + + self._setup_gpu_args() + trainer.main(self.args) + + def checkpoint(self): + import os + import submitit + + self.args.dist_url = get_init_file().as_uri() + checkpoint_file = os.path.join(self.args.output_dir, "checkpoint.pth") + if os.path.exists(checkpoint_file): + self.args.resume = checkpoint_file + print("Requeuing ", self.args) + empty_trainer = type(self)(self.args) + return submitit.helpers.DelayedSubmission(empty_trainer) + + def _setup_gpu_args(self): + import submitit + from pathlib import Path + + job_env = submitit.JobEnvironment() + self.args.output_dir = Path(str(self.args.output_dir).replace("%j", str(job_env.job_id))) + self.args.log_dir = self.args.output_dir + self.args.gpu = job_env.local_rank + self.args.rank = job_env.global_rank + self.args.world_size = job_env.num_tasks + print(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") + + +def main(): + args = parse_args() + if args.job_dir == "": + args.job_dir = get_shared_folder() / "%j" + + # Note that the folder will depend on the job_id, to easily track experiments + executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30) + + num_gpus_per_node = args.ngpus + nodes = args.nodes + timeout_min = args.timeout + + partition = args.partition + kwargs = {} + if args.use_volta32: + kwargs['slurm_constraint'] = 'volta32gb' + if args.comment: + kwargs['slurm_comment'] = args.comment + + executor.update_parameters( + mem_gb=40 * num_gpus_per_node, + gpus_per_node=num_gpus_per_node, + tasks_per_node=num_gpus_per_node, # one task per GPU + cpus_per_task=10, + nodes=nodes, + timeout_min=timeout_min, # max is 60 * 72 + # Below are cluster 
dependent parameters + slurm_partition=partition, + slurm_signal_delay_s=120, + **kwargs + ) + + executor.update_parameters(name="mae") + + args.dist_url = get_init_file().as_uri() + args.output_dir = args.job_dir + + trainer = Trainer(args) + job = executor.submit(trainer) + + # print("Submitted job_id:", job.job_id) + print(job.job_id) + + +if __name__ == "__main__": + main() diff --git a/PuzzleTuning/Counterpart PreTrain Methods/gcmae/test_npid.py b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/test_npid.py new file mode 100644 index 0000000000000000000000000000000000000000..91c9711b060103e935bf8b6f906cd1afb26b8aa9 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/test_npid.py @@ -0,0 +1,138 @@ +import torch +import time +from lib.utils import AverageMeter + + +def NN(epoch, net, lemniscate, trainloader, testloader, recompute_memory=0): + net.eval() + net_time = AverageMeter() + cls_time = AverageMeter() + losses = AverageMeter() + correct = 0. + total = 0 + testsize = testloader.dataset.__len__() + + trainFeatures = lemniscate.memory.t() + if hasattr(trainloader.dataset, 'imgs'): + trainLabels = torch.LongTensor([y for (p, y) in trainloader.dataset.imgs]).cuda() + else: + trainLabels = torch.LongTensor(trainloader.dataset.train_labels).cuda() + + if recompute_memory: + transform_bak = trainloader.dataset.transform + trainloader.dataset.transform = testloader.dataset.transform + temploader = torch.utils.data.DataLoader(trainloader.dataset, batch_size=300, shuffle=False, num_workers=20) + for batch_idx, (inputs, targets, indexes) in enumerate(temploader): + targets = targets.cuda(non_blocking = True) + inputs = inputs.cuda(non_blocking = True) + batchSize = inputs.size(0) + features = net(inputs, npid_feature = True) + trainFeatures[:, batch_idx*batchSize:batch_idx*batchSize+batchSize] = features.data.t() + trainLabels = torch.LongTensor(temploader.dataset.targets).cuda() + trainloader.dataset.transform = transform_bak + + end = time.time() + 
with torch.no_grad(): + for batch_idx, (inputs, targets, indexes) in enumerate(testloader): + targets = targets.cuda(non_blocking = True) + batchSize = inputs.size(0) + inputs = inputs.cuda(non_blocking = True) + features = net(inputs, npid_feature = True) + net_time.update(time.time() - end) + end = time.time() + + dist = torch.mm(features, trainFeatures) + + yd, yi = dist.topk(1, dim=1, largest=True, sorted=True) + candidates = trainLabels.view(1,-1).expand(batchSize, -1) + retrieval = torch.gather(candidates, 1, yi) + + retrieval = retrieval.narrow(1, 0, 1).clone().view(-1) + yd = yd.narrow(1, 0, 1) + + total += targets.size(0) + correct += retrieval.eq(targets.data).sum().item() + + cls_time.update(time.time() - end) + end = time.time() + + print('Test [{}/{}]\t' + 'Net Time {net_time.val:.3f} ({net_time.avg:.3f})\t' + 'Cls Time {cls_time.val:.3f} ({cls_time.avg:.3f})\t' + 'Top1: {:.2f}'.format( + total, testsize, correct*100./total, net_time=net_time, cls_time=cls_time)) + + return correct/total + +def kNN(epoch, net, lemniscate, trainloader, testloader, K, sigma, recompute_memory=0): + net.eval() + net_time = AverageMeter() + cls_time = AverageMeter() + total = 0 + testsize = testloader.dataset.__len__() + + trainFeatures = lemniscate.memory.t() + if hasattr(trainloader.dataset, 'imgs'): + trainLabels = torch.LongTensor([y for (p, y) in trainloader.dataset.imgs]).cuda() + else: + trainLabels = torch.LongTensor(trainloader.dataset.train_labels).cuda() + C = trainLabels.max() + 1 + + if recompute_memory: + transform_bak = trainloader.dataset.transform + trainloader.dataset.transform = testloader.dataset.transform + temploader = torch.utils.data.DataLoader(trainloader.dataset, batch_size=300, shuffle=False, num_workers=20) + for batch_idx, (inputs, targets, indexes) in enumerate(temploader): + targets = targets.cuda(non_blocking = True) + inputs = inputs.cuda(non_blocking = True) + batchSize = inputs.size(0) + features = net(inputs, npid_feature = True) + 
trainFeatures[:, batch_idx*batchSize:batch_idx*batchSize+batchSize] = features.data.t() + trainLabels = torch.LongTensor(temploader.dataset.targets).cuda() + trainloader.dataset.transform = transform_bak + + top1 = 0. + top5 = 0. + end = time.time() + with torch.no_grad(): + retrieval_one_hot = torch.zeros(K, C).cuda() #[200, 2] + for batch_idx, (inputs, targets, indexes) in enumerate(testloader): + end = time.time() + targets = targets.cuda(non_blocking = True) + inputs = inputs.cuda(non_blocking = True) + batchSize = inputs.size(0) + features = net(inputs, npid_feature = True) #[128, 768] + net_time.update(time.time() - end) + end = time.time() + + dist = torch.mm(features, trainFeatures) #[128, 22000] + + yd, yi = dist.topk(K, dim=1, largest=True, sorted=True) #[128, 200] + candidates = trainLabels.view(1,-1).expand(batchSize, -1) #[128, 22000] + retrieval = torch.gather(candidates, 1, yi) #[128, 200] + + retrieval_one_hot.resize_(batchSize * K, C).zero_() #[25600, 2] + retrieval_one_hot.scatter_(1, retrieval.view(-1, 1), 1) + yd_transform = yd.clone().div_(sigma).exp_() + probs = torch.sum(torch.mul(retrieval_one_hot.view(batchSize, -1 , C), yd_transform.view(batchSize, -1, 1)), 1) #[128, 2] + _, predictions = probs.sort(1, True) #取前k个,0和1的概率分别相加后选最大 + + # Find which predictions match the target + correct = predictions.eq(targets.data.view(-1,1)) + cls_time.update(time.time() - end) + + top1 = top1 + correct.narrow(1,0,1).sum().item() + #top5 = top5 + correct.narrow(1,0,5).sum().item() + + total += targets.size(0) + + print('Test [{}/{}]\t' + 'Net Time {net_time.val:.3f} ({net_time.avg:.3f})\t' + 'Cls Time {cls_time.val:.3f} ({cls_time.avg:.3f})\t' + 'Top1: {:.2f} Top5: {:.2f}'.format( + total, testsize, top1*100./total, top5*100./total, net_time=net_time, cls_time=cls_time)) + + print(top1*100./total) + + return top1/total + diff --git a/PuzzleTuning/Counterpart PreTrain Methods/gcmae/tsne.py b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/tsne.py new file 
mode 100644 index 0000000000000000000000000000000000000000..4ee864a777f61b039147cced9ba48e22f10afb81 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/tsne.py @@ -0,0 +1,183 @@ +import argparse +from random import shuffle +import numpy as np +import os +from pathlib import Path + +import torch +import torch.backends.cudnn as cudnn +import torchvision.transforms as transforms +import torchvision.datasets as datasets + +import timm + +assert timm.__version__ == "0.3.2" # version check +from timm.models.layers import trunc_normal_ + +import util.misc as misc +from util.pos_embed import interpolate_pos_embed +import models_encoder +from sklearn.manifold import TSNE +import matplotlib.pyplot as plt +def get_args_parser(): + parser = argparse.ArgumentParser('GCMAE feature representation visual', add_help=False) + # Model parameters + parser.add_argument('--model', default='vit_base_patch16', type=str, metavar='MODEL', + help='Name of model to train') + + parser.add_argument('--batch_size', default='128', type=int, + help='batch size') + # * Finetuning params + parser.add_argument('--random', default=False, + help='random init only') + ### mae + # camelyon/pre + # nctcrc/pre + ###gcmae + # camelyon/pre + # nctcrc/pre + parser.add_argument('--finetune', default='', + help='finetune from checkpoint') + parser.add_argument('--save_path', default='') + parser.add_argument('--data_path_val', default='', type=str, + help='dataset val path') + + parser.add_argument('--global_pool', action='store_true') + parser.set_defaults(global_pool=True) + parser.add_argument('--cls_token', action='store_false', dest='global_pool', + help='Use class token instead of global pool for classification') + + parser.add_argument('--device', default='cuda', + help='device to use for training / testing') + parser.add_argument('--seed', default=0, type=int) + + parser.add_argument('--num_workers', default=20, type=int) + parser.add_argument('--pin_mem', action='store_true', + 
help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.') + parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem') + parser.set_defaults(pin_mem=True) + + # distributed training parameters + parser.add_argument('--gpu_id', default=0, type=int, + help="the order of gpu") + return parser +def main(args): + torch.cuda.set_device(args.gpu_id) + + print('job dir: {}'.format(os.path.dirname(os.path.realpath(__file__)))) + print("{}".format(args).replace(', ', ',\n')) + + device = torch.device(args.device) + + # fix the seed for reproducibility + seed = args.seed + misc.get_rank() + torch.manual_seed(seed) + np.random.seed(seed) + + cudnn.benchmark = True + + # weak augmentation + transform_val = transforms.Compose([ + transforms.Resize(256, interpolation=3), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(mean=[0.6790435, 0.5052883, 0.66902906], std= [0.19158737, 0.2039779, 0.15648715])]) + + dataset_val = datasets.ImageFolder(args.data_path_val, transform=transform_val) + print(dataset_val) + + + data_loader_val = torch.utils.data.DataLoader( + dataset_val, + batch_size=args.batch_size, + num_workers=args.num_workers, + pin_memory=args.pin_mem, + drop_last=False + ) + + model = models_encoder.__dict__[args.model]( + global_pool=args.global_pool, + ) + + if args.finetune and not args.random: + checkpoint = torch.load(args.finetune, map_location='cpu') + + print("Load pre-trained checkpoint from: %s" % args.finetune) + checkpoint_model = checkpoint['model'] + state_dict = model.state_dict() + for k in ['head.weight', 'head.bias']: + if k in checkpoint_model and checkpoint_model[k].shape != state_dict[k].shape: + print(f"Removing key {k} from pretrained checkpoint") + del checkpoint_model[k] + + # interpolate position embedding + interpolate_pos_embed(model, checkpoint_model) + + # load pre-trained model + msg = model.load_state_dict(checkpoint_model, strict=False) + print(msg) + + if 
args.global_pool: + assert set(msg.missing_keys) == {'head.weight', 'head.bias', 'fc_norm.weight', 'fc_norm.bias'} + else: + assert set(msg.missing_keys) == {'head.weight', 'head.bias'} + + # manually initialize fc layer: following MoCo v3 + trunc_normal_(model.head.weight, std=0.01) + + # for linear prob only + # hack: revise model's head with BN + model.head = torch.nn.Sequential(torch.nn.BatchNorm1d(model.head.in_features, affine=False, eps=1e-6), model.head) + # freeze all but the head + for _, p in model.named_parameters(): + p.requires_grad = False + for _, p in model.head.named_parameters(): + p.requires_grad = True + + model.to(device) + evaluate(data_loader_val, model, device) + +def evaluate(data_loader, model, device): + t = TSNE(n_components=2, init='pca', random_state=0, perplexity=30, n_iter=5000) + + metric_logger = misc.MetricLogger(delimiter=" ") + header = 'Test:' + + # switch to evaluation mode + model.eval() + output_full = [] + target_full = [] + for batch in metric_logger.log_every(data_loader, 10, header): + images = batch[0] + target = batch[-1] + images = images.to(device, non_blocking=True) + target = target.to(device, non_blocking=True) + + # compute output + with torch.cuda.amp.autocast(): + output = model(images) + output_full += output.cpu().numpy().tolist() + target_full += target.cpu().numpy().tolist() + output_full = np.array(output_full) + target_full = np.array(target_full) + t = t.fit_transform(output_full) + + + x_min, x_max = t.min(0), t.max(0) + print("x_min:{}./n\ + x_max:{}".format(x_min, x_max)) + X_norm = (t - x_min) / (x_max - x_min) + print("X_norm shape:{}".format(X_norm.shape)) + + + plt.figure(figsize=(16, 16)) + for i in range(X_norm.shape[0]): + plt.text(X_norm[i, 0], X_norm[i, 1], str(target_full[i]), color=plt.cm.Set1(target_full[i]), fontdict={'weight': 'bold', 'size': 9}) + plt.xticks([]) + plt.yticks([]) + plt.savefig(args.save_path) + plt.show() +if __name__ == '__main__': + args = get_args_parser() + args = 
args.parse_args() + main(args) \ No newline at end of file diff --git a/PuzzleTuning/Counterpart PreTrain Methods/gcmae/util/__pycache__/lr_sched.cpython-38.pyc b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/util/__pycache__/lr_sched.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e8f29661d0c1408023371be5037f4838661f2171 Binary files /dev/null and b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/util/__pycache__/lr_sched.cpython-38.pyc differ diff --git a/PuzzleTuning/Counterpart PreTrain Methods/gcmae/util/__pycache__/misc.cpython-38.pyc b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/util/__pycache__/misc.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3cce271f36485e84a65a894adab623904c38d95f Binary files /dev/null and b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/util/__pycache__/misc.cpython-38.pyc differ diff --git a/PuzzleTuning/Counterpart PreTrain Methods/gcmae/util/__pycache__/pos_embed.cpython-38.pyc b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/util/__pycache__/pos_embed.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7636cc26f471927dfbda16d20fb1b9e6eb461577 Binary files /dev/null and b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/util/__pycache__/pos_embed.cpython-38.pyc differ diff --git a/PuzzleTuning/Counterpart PreTrain Methods/gcmae/util/crop.py b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/util/crop.py new file mode 100644 index 0000000000000000000000000000000000000000..bb2d97faf5156231c27d599b7799fef96e2cab27 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/util/crop.py @@ -0,0 +1,42 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+ +import math + +import torch + +from torchvision import transforms +from torchvision.transforms import functional as F + + +class RandomResizedCrop(transforms.RandomResizedCrop): + """ + RandomResizedCrop for matching TF/TPU implementation: no for-loop is used. + This may lead to results different with torchvision's version. + Following BYOL's TF code: + https://github.com/deepmind/deepmind-research/blob/master/byol/utils/dataset.py#L206 + """ + @staticmethod + def get_params(img, scale, ratio): + width, height = F.get_image_size(img) + area = height * width + + target_area = area * torch.empty(1).uniform_(scale[0], scale[1]).item() + log_ratio = torch.log(torch.tensor(ratio)) + aspect_ratio = torch.exp( + torch.empty(1).uniform_(log_ratio[0], log_ratio[1]) + ).item() + + w = int(round(math.sqrt(target_area * aspect_ratio))) + h = int(round(math.sqrt(target_area / aspect_ratio))) + + w = min(w, width) + h = min(h, height) + + i = torch.randint(0, height - h + 1, size=(1,)).item() + j = torch.randint(0, width - w + 1, size=(1,)).item() + + return i, j, h, w \ No newline at end of file diff --git a/PuzzleTuning/Counterpart PreTrain Methods/gcmae/util/datasets.py b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/util/datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..5abc8dc22a968378c6e062ff0f2e7bec11bc1f8a --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/gcmae/util/datasets.py @@ -0,0 +1,67 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+# -------------------------------------------------------- +# References: +# DeiT: https://github.com/facebookresearch/deit +# -------------------------------------------------------- + +import os +import PIL + +from torchvision import datasets, transforms + +from timm.data import create_transform +from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD + + +def build_dataset(is_train, args): + transform = build_transform(is_train, args) + + root = os.path.join(args.data_path, 'train' if is_train else 'test') + dataset = datasets.ImageFolder(root, transform=transform) + + print(dataset) + + return dataset + + +def build_transform(is_train, args): + # mean = IMAGENET_DEFAULT_MEAN + # std = IMAGENET_DEFAULT_STD + wsi_mean = (0.6790435, 0.5052883, 0.66902906) + wsi_std = (0.19158737, 0.2039779, 0.15648715) + # train transform + if is_train: + # this should always dispatch to transforms_imagenet_train + transform = create_transform( + input_size=args.input_size, + is_training=True, + color_jitter=args.color_jitter, + auto_augment=args.aa, + interpolation='bicubic', + re_prob=args.reprob, + re_mode=args.remode, + re_count=args.recount, + mean=wsi_mean, + std=wsi_std, + ) + return transform + + # eval transform + t = [] + if args.input_size <= 224: + crop_pct = 224 / 256 + else: + crop_pct = 1.0 + size = int(args.input_size / crop_pct) + t.append( + transforms.Resize(size, interpolation=PIL.Image.BICUBIC), # to maintain same ratio w.r.t. 
class LARS(torch.optim.Optimizer):
    """
    LARS optimizer, no rate scaling or weight decay for parameters <= 1D.

    Args:
        params: iterable of parameters or parameter-group dicts.
        lr: learning rate applied to the momentum buffer.
        weight_decay: L2 coefficient, applied only to parameters with ndim > 1.
        momentum: multiplier applied to the running update buffer each step.
        trust_coefficient: scale of the layer-wise trust ratio
            ``trust_coefficient * ||p|| / ||update||``.
    """
    def __init__(self, params, lr=0, weight_decay=0, momentum=0.9, trust_coefficient=0.001):
        defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum,
                        trust_coefficient=trust_coefficient)
        super().__init__(params, defaults)

    @torch.no_grad()
    def step(self, closure=None):
        """
        Perform a single optimization step.

        Args:
            closure: optional callable that re-evaluates the model and returns
                the loss, as in ``torch.optim.Optimizer.step``.

        Returns:
            The value returned by ``closure``, or ``None`` when no closure is given.
        """
        loss = None
        if closure is not None:
            # The closure typically runs forward/backward, so gradients must
            # be re-enabled inside this no_grad-decorated method.
            with torch.enable_grad():
                loss = closure()

        for g in self.param_groups:
            for p in g['params']:
                dp = p.grad

                if dp is None:
                    continue

                if p.ndim > 1:  # if not normalization gamma/beta or bias
                    dp = dp.add(p, alpha=g['weight_decay'])
                    param_norm = torch.norm(p)
                    update_norm = torch.norm(dp)
                    one = torch.ones_like(param_norm)
                    # Trust ratio; falls back to 1 when either norm is zero so
                    # the update is never divided by zero or silently zeroed.
                    q = torch.where(param_norm > 0.,
                                    torch.where(update_norm > 0,
                                                (g['trust_coefficient'] * param_norm / update_norm), one),
                                    one)
                    dp = dp.mul(q)

                # The momentum buffer is created lazily on first use.
                param_state = self.state[p]
                if 'mu' not in param_state:
                    param_state['mu'] = torch.zeros_like(p)
                mu = param_state['mu']
                mu.mul_(g['momentum']).add_(dp)
                p.add_(mu, alpha=-g['lr'])

        return loss
+# -------------------------------------------------------- +# References: +# ELECTRA https://github.com/google-research/electra +# BEiT: https://github.com/microsoft/unilm/tree/master/beit +# -------------------------------------------------------- + +import json + + +def param_groups_lrd(model, weight_decay=0.05, no_weight_decay_list=[], layer_decay=.75): + """ + Parameter groups for layer-wise lr decay + Following BEiT: https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py#L58 + """ + param_group_names = {} + param_groups = {} + + num_layers = len(model.blocks) + 1 + + layer_scales = list(layer_decay ** (num_layers - i) for i in range(num_layers + 1)) + + for n, p in model.named_parameters(): + if not p.requires_grad: + continue + + # no decay: all 1D parameters and model specific ones + if p.ndim == 1 or n in no_weight_decay_list: + g_decay = "no_decay" + this_decay = 0. + else: + g_decay = "decay" + this_decay = weight_decay + + layer_id = get_layer_id_for_vit(n, num_layers) + group_name = "layer_%d_%s" % (layer_id, g_decay) + + if group_name not in param_group_names: + this_scale = layer_scales[layer_id] + + param_group_names[group_name] = { + "lr_scale": this_scale, + "weight_decay": this_decay, + "params": [], + } + param_groups[group_name] = { + "lr_scale": this_scale, + "weight_decay": this_decay, + "params": [], + } + + param_group_names[group_name]["params"].append(n) + param_groups[group_name]["params"].append(p) + + # print("parameter groups: \n%s" % json.dumps(param_group_names, indent=2)) + + return list(param_groups.values()) + + +def get_layer_id_for_vit(name, num_layers): + """ + Assign a parameter with its layer id + Following BEiT: https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py#L33 + """ + if name in ['cls_token', 'pos_embed']: + return 0 + elif name.startswith('patch_embed'): + return 0 + elif name.startswith('blocks'): + return int(name.split('.')[1]) + 1 + else: + return num_layers \ No newline at end of 
def adjust_learning_rate(optimizer, epoch, args):
    """
    Decay the learning rate with half-cycle cosine after warmup

    Args:
        optimizer: object whose ``param_groups`` is a list of dicts; each
            group's ``"lr"`` is overwritten, multiplied by the group's
            ``"lr_scale"`` when that key is present.
        epoch: current (possibly fractional) epoch.
        args: namespace providing ``lr``, ``min_lr``, ``warmup_epochs`` and
            ``epochs``.

    Returns:
        The unscaled learning rate computed for ``epoch``.
    """
    if epoch < args.warmup_epochs:
        # Linear warmup from 0 to args.lr.
        lr = args.lr * epoch / args.warmup_epochs
    else:
        decay_epochs = args.epochs - args.warmup_epochs
        if decay_epochs == 0:
            # No cosine phase is configured (epochs == warmup_epochs): treat
            # the schedule as finished instead of dividing by zero.
            progress = 1.0
        else:
            progress = (epoch - args.warmup_epochs) / decay_epochs
        lr = args.min_lr + (args.lr - args.min_lr) * 0.5 * \
            (1. + math.cos(math.pi * progress))
    for param_group in optimizer.param_groups:
        if "lr_scale" in param_group:
            param_group["lr"] = lr * param_group["lr_scale"]
        else:
            param_group["lr"] = lr
    return lr
class SmoothedValue(object):
    """Track a series of values and provide access to smoothed values over a
    window or the global series average.

    ``window_size`` bounds the number of most recent values kept for the
    windowed statistics (``median``, ``avg``, ``max``, ``value``); ``fmt`` is
    the ``str.format`` template used by ``__str__`` and may reference any of
    ``median``, ``avg``, ``global_avg``, ``max`` and ``value``.
    """

    def __init__(self, window_size=20, fmt=None):
        if fmt is None:
            fmt = "{median:.4f} ({global_avg:.4f})"
        self.deque = deque(maxlen=window_size)
        self.total = 0.0
        self.count = 0
        self.fmt = fmt

    def update(self, value, n=1):
        """Record ``value``, counted ``n`` times in the global average."""
        self.deque.append(value)
        self.count += n
        self.total += value * n

    def synchronize_between_processes(self):
        """
        Sum ``count`` and ``total`` across processes.

        Warning: does not synchronize the deque!
        """
        if not is_dist_avail_and_initialized():
            return
        t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
        dist.barrier()
        dist.all_reduce(t)
        t = t.tolist()
        self.count = int(t[0])
        self.total = t[1]

    @property
    def median(self):
        """Median of the values currently in the window."""
        d = torch.tensor(list(self.deque))
        return d.median().item()

    @property
    def avg(self):
        """Mean of the values currently in the window."""
        d = torch.tensor(list(self.deque), dtype=torch.float32)
        return d.mean().item()

    @property
    def global_avg(self):
        """Weighted mean of every value seen so far (NaN before any update)."""
        if self.count == 0:
            # Nothing recorded yet: report NaN rather than raising
            # ZeroDivisionError.
            return float("nan")
        return self.total / self.count

    @property
    def max(self):
        """Largest value currently in the window."""
        return max(self.deque)

    @property
    def value(self):
        """Most recently recorded value."""
        return self.deque[-1]

    def __str__(self):
        return self.fmt.format(
            median=self.median,
            avg=self.avg,
            global_avg=self.global_avg,
            max=self.max,
            value=self.value)
def is_dist_avail_and_initialized():
    """Return True only when torch.distributed is both available and initialized."""
    return dist.is_available() and dist.is_initialized()
def init_distributed_mode(args):
    """
    Populate the distributed fields of ``args`` and start the process group.

    The launch environment is detected in this order:

    1. ``args.dist_on_itp`` is truthy: rank, world size and local rank are
       read from the ``OMPI_COMM_WORLD_*`` variables, ``args.dist_url`` is
       built from ``MASTER_ADDR``/``MASTER_PORT``, and ``LOCAL_RANK``,
       ``RANK`` and ``WORLD_SIZE`` are exported to the environment.
    2. ``RANK`` and ``WORLD_SIZE`` are set: rank, world size and local rank
       are read from ``RANK``, ``WORLD_SIZE`` and ``LOCAL_RANK``.
    3. ``SLURM_PROCID`` is set: it becomes the rank and the GPU index is the
       rank modulo the visible CUDA device count. ``args.world_size`` is not
       assigned in this branch, and ``args.dist_url`` is assigned only in the
       ``dist_on_itp`` branch; both are read below, so they must already be
       present on ``args``.
    4. Otherwise distributed mode is disabled (``args.distributed = False``)
       and the function returns early.

    In the distributed cases the CUDA device is selected, the NCCL process
    group is initialized, all ranks synchronize on a barrier, and printing is
    restricted via ``setup_for_distributed``.
    """
    env = os.environ

    # Membership tests on the environment are side-effect free, so the launch
    # mode can be resolved up front with the same precedence as an if/elif chain.
    use_itp = bool(args.dist_on_itp)
    use_env = (not use_itp) and 'RANK' in env and 'WORLD_SIZE' in env
    use_slurm = (not use_itp) and (not use_env) and 'SLURM_PROCID' in env

    if not (use_itp or use_env or use_slurm):
        print('Not using distributed mode')
        setup_for_distributed(is_master=True)  # hack
        args.distributed = False
        return

    if use_itp:
        args.rank = int(env['OMPI_COMM_WORLD_RANK'])
        args.world_size = int(env['OMPI_COMM_WORLD_SIZE'])
        args.gpu = int(env['OMPI_COMM_WORLD_LOCAL_RANK'])
        args.dist_url = "tcp://%s:%s" % (env['MASTER_ADDR'], env['MASTER_PORT'])
        # Export the variables the env-style launch path reads:
        # ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"]
        env['LOCAL_RANK'] = str(args.gpu)
        env['RANK'] = str(args.rank)
        env['WORLD_SIZE'] = str(args.world_size)
    elif use_env:
        args.rank = int(env["RANK"])
        args.world_size = int(env['WORLD_SIZE'])
        args.gpu = int(env['LOCAL_RANK'])
    else:
        args.rank = int(env['SLURM_PROCID'])
        args.gpu = args.rank % torch.cuda.device_count()

    args.distributed = True

    torch.cuda.set_device(args.gpu)
    args.dist_backend = 'nccl'
    banner = '| distributed init (rank {}): {}, gpu {}'.format(
        args.rank, args.dist_url, args.gpu)
    print(banner, flush=True)
    torch.distributed.init_process_group(
        backend=args.dist_backend,
        init_method=args.dist_url,
        world_size=args.world_size,
        rank=args.rank,
    )
    torch.distributed.barrier()
    setup_for_distributed(args.rank == 0)
def get_grad_norm_(parameters, norm_type: float = 2.0) -> torch.Tensor:
    """
    Compute the total gradient norm over ``parameters`` without clipping.

    Args:
        parameters: a single tensor or an iterable of tensors; entries whose
            ``.grad`` is ``None`` are ignored.
        norm_type: order of the norm; ``float("inf")`` selects the max-abs norm.

    Returns:
        A scalar tensor holding the norm of all gradients taken together, or
        ``tensor(0.)`` when no parameter has a gradient.
    """
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    parameters = [p for p in parameters if p.grad is not None]
    norm_type = float(norm_type)
    if len(parameters) == 0:
        return torch.tensor(0.)
    device = parameters[0].grad.device
    # Compare against the builtin float infinity so this function does not
    # depend on `torch._six.inf`, which newer PyTorch releases no longer ship.
    if norm_type == float("inf"):
        total_norm = max(p.grad.detach().abs().max().to(device) for p in parameters)
    else:
        # Norm of the per-parameter norms equals the norm of all gradients
        # concatenated.
        per_param = [torch.norm(p.grad.detach(), norm_type).to(device) for p in parameters]
        total_norm = torch.norm(torch.stack(per_param), norm_type)
    return total_norm
def all_reduce_mean(x):
    """
    Average ``x`` across all processes.

    With a world size of one (or no distributed setup) ``x`` is returned
    unchanged. Otherwise ``x`` is wrapped in a CUDA tensor, all-reduced,
    divided by the world size, and returned as a Python number.
    """
    n_procs = get_world_size()
    if n_procs <= 1:
        return x

    reduced = torch.tensor(x).cuda()
    dist.all_reduce(reduced)
    reduced /= n_procs
    return reduced.item()
def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
    """
    Build the 2D sine-cosine position embedding table for a square grid.

    grid_size: int of the grid height and width
    return:
    pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
    """
    coords = np.arange(grid_size, dtype=np.float32)
    # meshgrid is called as (w, h), so the width coordinates come first.
    mesh = np.stack(np.meshgrid(coords, coords), axis=0)
    mesh = mesh.reshape([2, 1, grid_size, grid_size])

    table = get_2d_sincos_pos_embed_from_grid(embed_dim, mesh)
    if not cls_token:
        return table
    # The class token gets an all-zero embedding row in front of the grid rows.
    return np.concatenate([np.zeros([1, embed_dim]), table], axis=0)
def interpolate_pos_embed(model, checkpoint_model):
    """
    Resize ``checkpoint_model['pos_embed']`` in place to match ``model``.

    Does nothing when the checkpoint has no ``pos_embed`` entry or when its
    (square) patch grid already has the side length the model expects.
    Otherwise the patch-position tokens are bicubically interpolated to the
    new grid while the leading extra tokens (class_token and dist_token) are
    kept unchanged, and the result replaces the checkpoint entry.
    """
    if 'pos_embed' not in checkpoint_model:
        return

    ckpt_embed = checkpoint_model['pos_embed']
    dim = ckpt_embed.shape[-1]
    n_patches = model.patch_embed.num_patches
    n_extra = model.pos_embed.shape[-2] - n_patches

    # height (== width) of the checkpoint grid and of the model grid
    src_side = int((ckpt_embed.shape[-2] - n_extra) ** 0.5)
    dst_side = int(n_patches ** 0.5)
    if src_side == dst_side:
        return

    print("Position interpolate from %dx%d to %dx%d" % (src_side, src_side, dst_side, dst_side))
    prefix_tokens = ckpt_embed[:, :n_extra]

    # only the position tokens are interpolated: (B, N, D) -> (B, D, H, W)
    grid = ckpt_embed[:, n_extra:].reshape(-1, src_side, src_side, dim).permute(0, 3, 1, 2)
    grid = torch.nn.functional.interpolate(
        grid, size=(dst_side, dst_side), mode='bicubic', align_corners=False)

    # back to (B, H*W, D) and re-attach the untouched extra tokens
    patch_tokens = grid.permute(0, 2, 3, 1).flatten(1, 2)
    checkpoint_model['pos_embed'] = torch.cat((prefix_tokens, patch_tokens), dim=1)
0000000000000000000000000000000000000000..08b500a221857ec3f451338e80b4a9ab1173a1af --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/moco-v3-main/CODE_OF_CONDUCT.md @@ -0,0 +1,80 @@ +# Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to make participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, sex characteristics, gender identity and expression, +level of experience, education, socio-economic status, nationality, personal +appearance, race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or + advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic + address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. 
+ +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies within all project spaces, and it also applies when +an individual is representing the project or its community in public spaces. +Examples of representing a project or community include using an official +project e-mail address, posting via an official social media account, or acting +as an appointed representative at an online or offline event. Representation of +a project may be further defined and clarified by project maintainers. + +This Code of Conduct also applies outside the project spaces when there is a +reasonable belief that an individual's behavior may have a negative impact on +the project or its community. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at <opensource-conduct@fb.com>. All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an incident. +Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership.
+ +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see +https://www.contributor-covenant.org/faq diff --git a/PuzzleTuning/Counterpart PreTrain Methods/moco-v3-main/CONFIG.md b/PuzzleTuning/Counterpart PreTrain Methods/moco-v3-main/CONFIG.md new file mode 100644 index 0000000000000000000000000000000000000000..ebd0dc7633bc5bbae4557bac0cbf467909d96aca --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/moco-v3-main/CONFIG.md @@ -0,0 +1,210 @@ +## MoCo v3 Reference Setups and Models + +Here we document the reference commands for pre-training and evaluating various MoCo v3 models. + +### ResNet-50 models + +With batch 4096, the training of all ResNet-50 models can fit into 2 nodes with a total of 16 Volta 32G GPUs. + +
+ResNet-50, 100-epoch pre-training. + +On the first node, run: +``` +python main_moco.py \ + --moco-m-cos --crop-min=.2 \ + --dist-url 'tcp://[your first node address]:[specified port]' \ + --multiprocessing-distributed --world-size 2 --rank 0 \ + [your imagenet-folder with train and val folders] +``` +On the second node, run the same command with `--rank 1`. +
+ +
+ResNet-50, 300-epoch pre-training. + +On the first node, run: +``` +python main_moco.py \ + --lr=.3 --epochs=300 \ + --moco-m-cos --crop-min=.2 \ + --dist-url 'tcp://[your first node address]:[specified port]' \ + --multiprocessing-distributed --world-size 2 --rank 0 \ + [your imagenet-folder with train and val folders] +``` +On the second node, run the same command with `--rank 1`. +
+ +
+ResNet-50, 1000-epoch pre-training. + +On the first node, run: +``` +python main_moco.py \ + --lr=.3 --wd=1.5e-6 --epochs=1000 \ + --moco-m=0.996 --moco-m-cos --crop-min=.2 \ + --dist-url 'tcp://[your first node address]:[specified port]' \ + --multiprocessing-distributed --world-size 2 --rank 0 \ + [your imagenet-folder with train and val folders] +``` +On the second node, run the same command with `--rank 1`. +
+ +
+ResNet-50, linear classification. + +Run on single node: +``` +python main_lincls.py \ + --dist-url 'tcp://localhost:10001' \ + --multiprocessing-distributed --world-size 1 --rank 0 \ + --pretrained [your checkpoint path]/[your checkpoint file].pth.tar \ + [your imagenet-folder with train and val folders] +``` +
+ +Below are our pre-trained ResNet-50 models and logs. + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| pretrain epochs | linear acc | pretrain files | linear files |
| --- | --- | --- | --- |
| 100 | 68.9 | chpt | chpt / log |
| 300 | 72.8 | chpt | chpt / log |
| 1000 | 74.6 | chpt | chpt / log |
+ + +### ViT Models + +All ViT models are pre-trained for 300 epochs with AdamW. + +
+ViT-Small, 1-node (8-GPU), 1024-batch pre-training. + +This setup fits into a single node of 8 Volta 32G GPUs, for ease of debugging. +``` +python main_moco.py \ + -a vit_small -b 1024 \ + --optimizer=adamw --lr=1.5e-4 --weight-decay=.1 \ + --epochs=300 --warmup-epochs=40 \ + --stop-grad-conv1 --moco-m-cos --moco-t=.2 \ + --dist-url 'tcp://localhost:10001' \ + --multiprocessing-distributed --world-size 1 --rank 0 \ + [your imagenet-folder with train and val folders] +``` + +
+ +
+ViT-Small, 4-node (32-GPU) pre-training. + +On the first node, run: +``` +python main_moco.py \ + -a vit_small \ + --optimizer=adamw --lr=1.5e-4 --weight-decay=.1 \ + --epochs=300 --warmup-epochs=40 \ + --stop-grad-conv1 --moco-m-cos --moco-t=.2 \ + --dist-url 'tcp://[your first node address]:[specified port]' \ + --multiprocessing-distributed --world-size 4 --rank 0 \ + [your imagenet-folder with train and val folders] +``` +On other nodes, run the same command with `--rank 1`, ..., `--rank 3` respectively. +
+ +
+ViT-Small, linear classification. + +Run on single node: +``` +python main_lincls.py \ + -a vit_small --lr=3 \ + --dist-url 'tcp://localhost:10001' \ + --multiprocessing-distributed --world-size 1 --rank 0 \ + --pretrained [your checkpoint path]/[your checkpoint file].pth.tar \ + [your imagenet-folder with train and val folders] +``` +
+ +
+ViT-Base, 8-node (64-GPU) pre-training. + +``` +python main_moco.py \ + -a vit_base \ + --optimizer=adamw --lr=1.5e-4 --weight-decay=.1 \ + --epochs=300 --warmup-epochs=40 \ + --stop-grad-conv1 --moco-m-cos --moco-t=.2 \ + --dist-url 'tcp://[your first node address]:[specified port]' \ + --multiprocessing-distributed --world-size 8 --rank 0 \ + [your imagenet-folder with train and val folders] +``` +On other nodes, run the same command with `--rank 1`, ..., `--rank 7` respectively. +
+ +
+ViT-Base, linear classification. + +Run on single node: +``` +python main_lincls.py \ + -a vit_base --lr=3 \ + --dist-url 'tcp://localhost:10001' \ + --multiprocessing-distributed --world-size 1 --rank 0 \ + --pretrained [your checkpoint path]/[your checkpoint file].pth.tar \ + [your imagenet-folder with train and val folders] +``` +
+ + +Below are our pre-trained ViT models and logs (batch 4096). + + + + + + + + + + + + + + + + + + + + + + + + +
| model | pretrain epochs | linear acc | pretrain files | linear files |
| --- | --- | --- | --- | --- |
| ViT-Small | 300 | 73.2 | chpt | chpt / log |
| ViT-Base | 300 | 76.7 | chpt | chpt / log |
diff --git a/PuzzleTuning/Counterpart PreTrain Methods/moco-v3-main/CONTRIBUTING.md b/PuzzleTuning/Counterpart PreTrain Methods/moco-v3-main/CONTRIBUTING.md new file mode 100644 index 0000000000000000000000000000000000000000..dc46d7ca4fdeb197e02755f98f986719ac084381 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/moco-v3-main/CONTRIBUTING.md @@ -0,0 +1,31 @@ +# Contributing to moco-v3 +We want to make contributing to this project as easy and transparent as +possible. + +## Pull Requests +We actively welcome your pull requests. + +1. Fork the repo and create your branch from `master`. +2. If you've added code that should be tested, add tests. +3. If you've changed APIs, update the documentation. +4. Ensure the test suite passes. +5. Make sure your code lints. +6. If you haven't already, complete the Contributor License Agreement ("CLA"). + +## Contributor License Agreement ("CLA") +In order to accept your pull request, we need you to submit a CLA. You only need +to do this once to work on any of Facebook's open source projects. + +Complete your CLA here: <https://code.facebook.com/cla> + +## Issues +We use GitHub issues to track public bugs. Please ensure your description is +clear and has sufficient instructions to be able to reproduce the issue. + +Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe +disclosure of security bugs. In those cases, please go through the process +outlined on that page and do not file a public issue. + +## License +By contributing to moco-v3, you agree that your contributions will be licensed +under the LICENSE file in the root directory of this source tree.
\ No newline at end of file diff --git a/PuzzleTuning/Counterpart PreTrain Methods/moco-v3-main/LICENSE b/PuzzleTuning/Counterpart PreTrain Methods/moco-v3-main/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..105a4fb33f75de2fc37c4bd73b7952e19602b589 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/moco-v3-main/LICENSE @@ -0,0 +1,399 @@ +Attribution-NonCommercial 4.0 International + +======================================================================= + +Creative Commons Corporation ("Creative Commons") is not a law firm and +does not provide legal services or legal advice. Distribution of +Creative Commons public licenses does not create a lawyer-client or +other relationship. Creative Commons makes its licenses and related +information available on an "as-is" basis. Creative Commons gives no +warranties regarding its licenses, any material licensed under their +terms and conditions, or any related information. Creative Commons +disclaims all liability for damages resulting from their use to the +fullest extent possible. + +Using Creative Commons Public Licenses + +Creative Commons public licenses provide a standard set of terms and +conditions that creators and other rights holders may use to share +original works of authorship and other material subject to copyright +and certain other rights specified in the public license below. The +following considerations are for informational purposes only, are not +exhaustive, and do not form part of our licenses. + + Considerations for licensors: Our public licenses are + intended for use by those authorized to give the public + permission to use material in ways otherwise restricted by + copyright and certain other rights. Our licenses are + irrevocable. Licensors should read and understand the terms + and conditions of the license they choose before applying it. 
+ Licensors should also secure all rights necessary before + applying our licenses so that the public can reuse the + material as expected. Licensors should clearly mark any + material not subject to the license. This includes other CC- + licensed material, or material used under an exception or + limitation to copyright. More considerations for licensors: + wiki.creativecommons.org/Considerations_for_licensors + + Considerations for the public: By using one of our public + licenses, a licensor grants the public permission to use the + licensed material under specified terms and conditions. If + the licensor's permission is not necessary for any reason--for + example, because of any applicable exception or limitation to + copyright--then that use is not regulated by the license. Our + licenses grant only permissions under copyright and certain + other rights that a licensor has authority to grant. Use of + the licensed material may still be restricted for other + reasons, including because others have copyright or other + rights in the material. A licensor may make special requests, + such as asking that all changes be marked or described. + Although not required by our licenses, you are encouraged to + respect those requests where reasonable. More_considerations + for the public: + wiki.creativecommons.org/Considerations_for_licensees + +======================================================================= + +Creative Commons Attribution-NonCommercial 4.0 International Public +License + +By exercising the Licensed Rights (defined below), You accept and agree +to be bound by the terms and conditions of this Creative Commons +Attribution-NonCommercial 4.0 International Public License ("Public +License"). 
To the extent this Public License may be interpreted as a +contract, You are granted the Licensed Rights in consideration of Your +acceptance of these terms and conditions, and the Licensor grants You +such rights in consideration of benefits the Licensor receives from +making the Licensed Material available under these terms and +conditions. + +Section 1 -- Definitions. + + a. Adapted Material means material subject to Copyright and Similar + Rights that is derived from or based upon the Licensed Material + and in which the Licensed Material is translated, altered, + arranged, transformed, or otherwise modified in a manner requiring + permission under the Copyright and Similar Rights held by the + Licensor. For purposes of this Public License, where the Licensed + Material is a musical work, performance, or sound recording, + Adapted Material is always produced where the Licensed Material is + synched in timed relation with a moving image. + + b. Adapter's License means the license You apply to Your Copyright + and Similar Rights in Your contributions to Adapted Material in + accordance with the terms and conditions of this Public License. + + c. Copyright and Similar Rights means copyright and/or similar rights + closely related to copyright including, without limitation, + performance, broadcast, sound recording, and Sui Generis Database + Rights, without regard to how the rights are labeled or + categorized. For purposes of this Public License, the rights + specified in Section 2(b)(1)-(2) are not Copyright and Similar + Rights. + d. Effective Technological Measures means those measures that, in the + absence of proper authority, may not be circumvented under laws + fulfilling obligations under Article 11 of the WIPO Copyright + Treaty adopted on December 20, 1996, and/or similar international + agreements. + + e. 
Exceptions and Limitations means fair use, fair dealing, and/or + any other exception or limitation to Copyright and Similar Rights + that applies to Your use of the Licensed Material. + + f. Licensed Material means the artistic or literary work, database, + or other material to which the Licensor applied this Public + License. + + g. Licensed Rights means the rights granted to You subject to the + terms and conditions of this Public License, which are limited to + all Copyright and Similar Rights that apply to Your use of the + Licensed Material and that the Licensor has authority to license. + + h. Licensor means the individual(s) or entity(ies) granting rights + under this Public License. + + i. NonCommercial means not primarily intended for or directed towards + commercial advantage or monetary compensation. For purposes of + this Public License, the exchange of the Licensed Material for + other material subject to Copyright and Similar Rights by digital + file-sharing or similar means is NonCommercial provided there is + no payment of monetary compensation in connection with the + exchange. + + j. Share means to provide material to the public by any means or + process that requires permission under the Licensed Rights, such + as reproduction, public display, public performance, distribution, + dissemination, communication, or importation, and to make material + available to the public including in ways that members of the + public may access the material from a place and at a time + individually chosen by them. + + k. Sui Generis Database Rights means rights other than copyright + resulting from Directive 96/9/EC of the European Parliament and of + the Council of 11 March 1996 on the legal protection of databases, + as amended and/or succeeded, as well as other essentially + equivalent rights anywhere in the world. + + l. You means the individual or entity exercising the Licensed Rights + under this Public License. Your has a corresponding meaning. 
+ +Section 2 -- Scope. + + a. License grant. + + 1. Subject to the terms and conditions of this Public License, + the Licensor hereby grants You a worldwide, royalty-free, + non-sublicensable, non-exclusive, irrevocable license to + exercise the Licensed Rights in the Licensed Material to: + + a. reproduce and Share the Licensed Material, in whole or + in part, for NonCommercial purposes only; and + + b. produce, reproduce, and Share Adapted Material for + NonCommercial purposes only. + + 2. Exceptions and Limitations. For the avoidance of doubt, where + Exceptions and Limitations apply to Your use, this Public + License does not apply, and You do not need to comply with + its terms and conditions. + + 3. Term. The term of this Public License is specified in Section + 6(a). + + 4. Media and formats; technical modifications allowed. The + Licensor authorizes You to exercise the Licensed Rights in + all media and formats whether now known or hereafter created, + and to make technical modifications necessary to do so. The + Licensor waives and/or agrees not to assert any right or + authority to forbid You from making technical modifications + necessary to exercise the Licensed Rights, including + technical modifications necessary to circumvent Effective + Technological Measures. For purposes of this Public License, + simply making modifications authorized by this Section 2(a) + (4) never produces Adapted Material. + + 5. Downstream recipients. + + a. Offer from the Licensor -- Licensed Material. Every + recipient of the Licensed Material automatically + receives an offer from the Licensor to exercise the + Licensed Rights under the terms and conditions of this + Public License. + + b. No downstream restrictions. You may not offer or impose + any additional or different terms or conditions on, or + apply any Effective Technological Measures to, the + Licensed Material if doing so restricts exercise of the + Licensed Rights by any recipient of the Licensed + Material. 
+ + 6. No endorsement. Nothing in this Public License constitutes or + may be construed as permission to assert or imply that You + are, or that Your use of the Licensed Material is, connected + with, or sponsored, endorsed, or granted official status by, + the Licensor or others designated to receive attribution as + provided in Section 3(a)(1)(A)(i). + + b. Other rights. + + 1. Moral rights, such as the right of integrity, are not + licensed under this Public License, nor are publicity, + privacy, and/or other similar personality rights; however, to + the extent possible, the Licensor waives and/or agrees not to + assert any such rights held by the Licensor to the limited + extent necessary to allow You to exercise the Licensed + Rights, but not otherwise. + + 2. Patent and trademark rights are not licensed under this + Public License. + + 3. To the extent possible, the Licensor waives any right to + collect royalties from You for the exercise of the Licensed + Rights, whether directly or through a collecting society + under any voluntary or waivable statutory or compulsory + licensing scheme. In all other cases the Licensor expressly + reserves any right to collect such royalties, including when + the Licensed Material is used other than for NonCommercial + purposes. + +Section 3 -- License Conditions. + +Your exercise of the Licensed Rights is expressly made subject to the +following conditions. + + a. Attribution. + + 1. If You Share the Licensed Material (including in modified + form), You must: + + a. retain the following if it is supplied by the Licensor + with the Licensed Material: + + i. identification of the creator(s) of the Licensed + Material and any others designated to receive + attribution, in any reasonable manner requested by + the Licensor (including by pseudonym if + designated); + + ii. a copyright notice; + + iii. a notice that refers to this Public License; + + iv. a notice that refers to the disclaimer of + warranties; + + v. 
a URI or hyperlink to the Licensed Material to the + extent reasonably practicable; + + b. indicate if You modified the Licensed Material and + retain an indication of any previous modifications; and + + c. indicate the Licensed Material is licensed under this + Public License, and include the text of, or the URI or + hyperlink to, this Public License. + + 2. You may satisfy the conditions in Section 3(a)(1) in any + reasonable manner based on the medium, means, and context in + which You Share the Licensed Material. For example, it may be + reasonable to satisfy the conditions by providing a URI or + hyperlink to a resource that includes the required + information. + + 3. If requested by the Licensor, You must remove any of the + information required by Section 3(a)(1)(A) to the extent + reasonably practicable. + + 4. If You Share Adapted Material You produce, the Adapter's + License You apply must not prevent recipients of the Adapted + Material from complying with this Public License. + +Section 4 -- Sui Generis Database Rights. + +Where the Licensed Rights include Sui Generis Database Rights that +apply to Your use of the Licensed Material: + + a. for the avoidance of doubt, Section 2(a)(1) grants You the right + to extract, reuse, reproduce, and Share all or a substantial + portion of the contents of the database for NonCommercial purposes + only; + + b. if You include all or a substantial portion of the database + contents in a database in which You have Sui Generis Database + Rights, then the database in which You have Sui Generis Database + Rights (but not its individual contents) is Adapted Material; and + + c. You must comply with the conditions in Section 3(a) if You Share + all or a substantial portion of the contents of the database. + +For the avoidance of doubt, this Section 4 supplements and does not +replace Your obligations under this Public License where the Licensed +Rights include other Copyright and Similar Rights. 
+ +Section 5 -- Disclaimer of Warranties and Limitation of Liability. + + a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE + EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS + AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF + ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, + IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, + WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR + PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, + ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT + KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT + ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. + + b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE + TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, + NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, + INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, + COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR + USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN + ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR + DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR + IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. + + c. The disclaimer of warranties and limitation of liability provided + above shall be interpreted in a manner that, to the extent + possible, most closely approximates an absolute disclaimer and + waiver of all liability. + +Section 6 -- Term and Termination. + + a. This Public License applies for the term of the Copyright and + Similar Rights licensed here. However, if You fail to comply with + this Public License, then Your rights under this Public License + terminate automatically. + + b. Where Your right to use the Licensed Material has terminated under + Section 6(a), it reinstates: + + 1. 
automatically as of the date the violation is cured, provided + it is cured within 30 days of Your discovery of the + violation; or + + 2. upon express reinstatement by the Licensor. + + For the avoidance of doubt, this Section 6(b) does not affect any + right the Licensor may have to seek remedies for Your violations + of this Public License. + + c. For the avoidance of doubt, the Licensor may also offer the + Licensed Material under separate terms or conditions or stop + distributing the Licensed Material at any time; however, doing so + will not terminate this Public License. + + d. Sections 1, 5, 6, 7, and 8 survive termination of this Public + License. + +Section 7 -- Other Terms and Conditions. + + a. The Licensor shall not be bound by any additional or different + terms or conditions communicated by You unless expressly agreed. + + b. Any arrangements, understandings, or agreements regarding the + Licensed Material not stated herein are separate from and + independent of the terms and conditions of this Public License. + +Section 8 -- Interpretation. + + a. For the avoidance of doubt, this Public License does not, and + shall not be interpreted to, reduce, limit, restrict, or impose + conditions on any use of the Licensed Material that could lawfully + be made without permission under this Public License. + + b. To the extent possible, if any provision of this Public License is + deemed unenforceable, it shall be automatically reformed to the + minimum extent necessary to make it enforceable. If the provision + cannot be reformed, it shall be severed from this Public License + without affecting the enforceability of the remaining terms and + conditions. + + c. No term or condition of this Public License will be waived and no + failure to comply consented to unless expressly agreed to by the + Licensor. + + d. 
Nothing in this Public License constitutes or may be interpreted + as a limitation upon, or waiver of, any privileges and immunities + that apply to the Licensor or You, including from the legal + processes of any jurisdiction or authority. + +======================================================================= + +Creative Commons is not a party to its public +licenses. Notwithstanding, Creative Commons may elect to apply one of +its public licenses to material it publishes and in those instances +will be considered the “Licensor.” The text of the Creative Commons +public licenses is dedicated to the public domain under the CC0 Public +Domain Dedication. Except for the limited purpose of indicating that +material is shared under a Creative Commons public license or as +otherwise permitted by the Creative Commons policies published at +creativecommons.org/policies, Creative Commons does not authorize the +use of the trademark "Creative Commons" or any other trademark or logo +of Creative Commons without its prior written consent including, +without limitation, in connection with any unauthorized modifications +to any of its public licenses or any other arrangements, +understandings, or agreements concerning use of licensed material. For +the avoidance of doubt, this paragraph does not form part of the +public licenses. + +Creative Commons may be contacted at creativecommons.org. 
\ No newline at end of file diff --git a/PuzzleTuning/Counterpart PreTrain Methods/moco-v3-main/README.md b/PuzzleTuning/Counterpart PreTrain Methods/moco-v3-main/README.md new file mode 100644 index 0000000000000000000000000000000000000000..af75ff4c58def55959be476e8928e420b08004c2 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/moco-v3-main/README.md @@ -0,0 +1,15 @@ +## MoCo v3 for Self-supervised ResNet and ViT + +The original repo of MoCo-v3 can be found [here](https://github.com/facebookresearch/moco-v3). + +Pip requirements: timm == 0.4.9, PyTorch == 1.9.0, Torchvision == 0.10.0, CUDA == 10.2, NumPy == 1.19 + +Typical BASH: + ```console +python main_moco.py \ + -a vit_base -b 512 \ + --optimizer=adamw --lr=1.5e-4 --weight-decay=.1 \ + --epochs=100 --warmup-epochs=20 \ + --stop-grad-conv1 --moco-m-cos --moco-t=.2 --dist-url 'tcp://localhost:10001' \ + --multiprocessing-distributed --world-size 1 --rank 0 --basic_state_dict the/path/of/CPIA + ``` diff --git a/PuzzleTuning/Counterpart PreTrain Methods/moco-v3-main/__pycache__/vits.cpython-39.pyc b/PuzzleTuning/Counterpart PreTrain Methods/moco-v3-main/__pycache__/vits.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..111dc8df11051824a6f387e7d710330d52dce6e8 Binary files /dev/null and b/PuzzleTuning/Counterpart PreTrain Methods/moco-v3-main/__pycache__/vits.cpython-39.pyc differ diff --git a/PuzzleTuning/Counterpart PreTrain Methods/moco-v3-main/convert_to_deit.py b/PuzzleTuning/Counterpart PreTrain Methods/moco-v3-main/convert_to_deit.py new file mode 100644 index 0000000000000000000000000000000000000000..b2a12f0e90e61180131a8a907dd6192f4d321c8b --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/moco-v3-main/convert_to_deit.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved.
def extract_base_encoder_state(state_dict,
                               prefix='module.base_encoder.',
                               head_prefix='module.base_encoder.head'):
    """Return the base-encoder weights of a MoCo checkpoint with the prefix removed.

    Only keys that start with ``prefix`` and do not start with ``head_prefix``
    are kept; every other key (e.g. momentum encoder, predictor, the encoder's
    own head) is dropped. Kept keys have ``prefix`` stripped.

    Args:
        state_dict: mapping of parameter name -> value, as stored under
            ``checkpoint['state_dict']``.
        prefix: key prefix identifying base-encoder parameters. It includes the
            trailing dot so that a name such as ``module.base_encoder_x`` is
            not mistaken for an encoder parameter and truncated.
        head_prefix: key prefix of the parameters to discard.

    Returns:
        A new dict (the input is not modified) preserving the input order.
    """
    return {
        key[len(prefix):]: value
        for key, value in state_dict.items()
        if key.startswith(prefix) and not key.startswith(head_prefix)
    }


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Convert MoCo Pre-Traind Model to DEiT')
    parser.add_argument('--input', default='', type=str, metavar='PATH', required=True,
                        help='path to moco pre-trained checkpoint')
    parser.add_argument('--output', default='', type=str, metavar='PATH', required=True,
                        help='path to output checkpoint in DEiT format')
    args = parser.parse_args()
    print(args)

    # load input
    # NOTE(review): torch.load unpickles the file; only use on trusted checkpoints.
    checkpoint = torch.load(args.input, map_location="cpu")
    # retain only base_encoder up to before the embedding layer, prefix removed
    state_dict = extract_base_encoder_state(checkpoint['state_dict'])

    # make output directory if necessary; dirname is '' for a bare filename,
    # in which case the file goes to the current directory and nothing is created
    output_dir = os.path.dirname(args.output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    # save to output
    torch.save({'model': state_dict}, args.output)
# Names of every lowercase, public, callable entry exported by torchvision.models.
torchvision_model_names = sorted(name for name in torchvision_models.__dict__
    if name.islower() and not name.startswith("__")
    and callable(torchvision_models.__dict__[name]))

# Architectures selectable with --arch: the local ViT variants plus torchvision's.
model_names = ['vit_small', 'vit_base', 'vit_conv_small', 'vit_conv_base'] + torchvision_model_names

# Command-line interface of the linear-classification script.
# Help strings that quote a default use argparse's %(default)s placeholder so
# the displayed value cannot drift from the real one (the original text claimed
# 32 workers and a batch size of 1024 while the defaults are 0 and 4).
parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
# NOTE(review): the --data and --pretrained defaults are machine-specific
# absolute paths; callers on other machines must pass these options explicitly.
parser.add_argument('--data', default='/Users/munros/ROSE/MARS_MIL',
                    help='path to dataset')
parser.add_argument('-a', '--arch', metavar='ARCH', default='vit_base',
                    choices=model_names,
                    help='model architecture: ' +
                        ' | '.join(model_names) +
                        ' (default: vit_base)')
parser.add_argument('-j', '--workers', default=0, type=int, metavar='N',
                    help='number of data loading workers (default: %(default)s)')
parser.add_argument('--epochs', default=5, type=int, metavar='N',
                    help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')
parser.add_argument('-b', '--batch-size', default=4, type=int,
                    metavar='N',
                    help='mini-batch size (default: %(default)s), this is the total '
                         'batch size of all GPUs on all nodes when '
                         'using Data Parallel or Distributed Data Parallel')
parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
                    metavar='LR', help='initial (base) learning rate', dest='lr')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum')
parser.add_argument('--wd', '--weight-decay', default=0., type=float,
                    metavar='W', help='weight decay (default: 0.)',
                    dest='weight_decay')
parser.add_argument('-p', '--print-freq', default=10, type=int,
                    metavar='N', help='print frequency (default: 10)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
                    help='path to latest checkpoint (default: none)')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                    help='evaluate model on validation set')
parser.add_argument('--world-size', default=-1, type=int,
                    help='number of nodes for distributed training')
parser.add_argument('--rank', default=-1, type=int,
                    help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
                    help='distributed backend')
parser.add_argument('--seed', default=None, type=int,
                    help='seed for initializing training. ')
parser.add_argument('--gpu', default=None, type=int,
                    help='GPU id to use.')
parser.add_argument('--multiprocessing-distributed', action='store_true',
                    help='Use multi-processing distributed training to launch '
                         'N processes per node, which has N GPUs. This is the '
                         'fastest way to use PyTorch for either single node or '
                         'multi node data parallel training')

# additional configs:
parser.add_argument('--pretrained', default='/Users/munros/Desktop/moco_checkpoint_0000.pth.tar', type=str,
                    help='path to moco pretrained checkpoint')

# Best top-1 validation accuracy seen so far; updated by main_worker.
best_acc1 = 0
def main_worker(gpu, ngpus_per_node, args):
    """Train and evaluate a linear classifier on top of a frozen backbone.

    Builds the model named by ``args.arch``, freezes everything except the
    final linear layer, optionally loads MoCo pre-trained backbone weights,
    wraps the model for the selected (distributed / single GPU / DataParallel /
    CPU) execution mode, and then either evaluates once (``args.evaluate``) or
    runs the train/validate loop, saving a checkpoint after every epoch.

    Args:
        gpu: GPU index for this process, or ``None``. Stored into ``args.gpu``.
        ngpus_per_node: number of GPUs on this node; used to derive the global
            rank and to split the data-loading workers.
        args: parsed command-line namespace. Several fields are rewritten here
            (``gpu``, ``rank``, ``start_epoch``, ``batch_size``, ``workers``).

    Side effects:
        Updates the module-level ``best_acc1``; may replace ``builtins.print``
        with a no-op on non-master processes; writes checkpoint files.
    """
    global best_acc1
    args.gpu = gpu

    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:
        # accept keyword arguments too, so print(..., flush=True) etc. stay valid
        def print_pass(*_args, **_kwargs):
            pass
        builtins.print = print_pass

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)
        torch.distributed.barrier()
    # create model
    print("=> creating model '{}'".format(args.arch))
    if args.arch.startswith('vit'):
        model = vits.__dict__[args.arch]()
        linear_keyword = 'head'
    else:
        model = torchvision_models.__dict__[args.arch]()
        linear_keyword = 'fc'

    # freeze all layers but the last fc
    for name, param in model.named_parameters():
        if name not in ['%s.weight' % linear_keyword, '%s.bias' % linear_keyword]:
            param.requires_grad = False
    # init the fc layer
    getattr(model, linear_keyword).weight.data.normal_(mean=0.0, std=0.01)
    getattr(model, linear_keyword).bias.data.zero_()

    # Tracks whether backbone weights were really loaded, so the post-epoch
    # sanity check is only attempted when there is a checkpoint to compare with.
    pretrained_loaded = False

    # load from pre-trained, before DistributedDataParallel constructor
    if args.pretrained:
        if os.path.isfile(args.pretrained):
            print("=> loading checkpoint '{}'".format(args.pretrained))
            checkpoint = torch.load(args.pretrained, map_location="cpu")

            # rename moco pre-trained keys
            state_dict = checkpoint['state_dict']
            for k in list(state_dict.keys()):
                # retain only base_encoder up to before the embedding layer
                if k.startswith('module.base_encoder') and not k.startswith('module.base_encoder.%s' % linear_keyword):
                    # remove prefix
                    state_dict[k[len("module.base_encoder."):]] = state_dict[k]
                # delete renamed or unused k
                del state_dict[k]

            args.start_epoch = 0
            msg = model.load_state_dict(state_dict, strict=False)
            # only the freshly initialised linear layer may be absent
            assert set(msg.missing_keys) == {"%s.weight" % linear_keyword, "%s.bias" % linear_keyword}

            pretrained_loaded = True
            print("=> loaded pre-trained model '{}'".format(args.pretrained))
        else:
            print("=> no checkpoint found at '{}'".format(args.pretrained))

    # infer learning rate before changing batch size
    init_lr = args.lr * args.batch_size / 256

    if not torch.cuda.is_available():
        print('using CPU, this will be slow')
    elif args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / args.world_size)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    # optimize only the linear classifier
    parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
    assert len(parameters) == 2  # weight, bias

    optimizer = torch.optim.SGD(parameters, init_lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None and torch.is_tensor(best_acc1):
                # best_acc1 may be from a checkpoint from a different GPU;
                # a plain number (no .to()) needs no device move
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=256, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, init_lr, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                and args.rank == 0):  # only the first GPU saves checkpoint
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer' : optimizer.state_dict(),
            }, is_best)
            # Compare against the pretrained file only if it was actually
            # loaded; otherwise sanity_check would fail opening a missing file
            # after a full epoch of work.
            if epoch == args.start_epoch and pretrained_loaded:
                sanity_check(model.state_dict(), args.pretrained, linear_keyword)
+ """ + model.eval() + + end = time.time() + for i, (images, target) in enumerate(train_loader): + # measure data loading time + data_time.update(time.time() - end) + + if args.gpu is not None: + images = images.cuda(args.gpu, non_blocking=True) + if torch.cuda.is_available(): + target = target.cuda(args.gpu, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # compute gradient and do SGD step + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + progress.display(i) + + +def validate(val_loader, model, criterion, args): + batch_time = AverageMeter('Time', ':6.3f') + losses = AverageMeter('Loss', ':.4e') + top1 = AverageMeter('Acc@1', ':6.2f') + top5 = AverageMeter('Acc@5', ':6.2f') + progress = ProgressMeter( + len(val_loader), + [batch_time, losses, top1, top5], + prefix='Test: ') + + # switch to evaluate mode + model.eval() + + with torch.no_grad(): + end = time.time() + for i, (images, target) in enumerate(val_loader): + if args.gpu is not None: + images = images.cuda(args.gpu, non_blocking=True) + if torch.cuda.is_available(): + target = target.cuda(args.gpu, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + progress.display(i) + + # TODO: this should also be done with the ProgressMeter + 
class AverageMeter(object):
    """Track the latest value and the running weighted average of a metric.

    Args:
        name: label printed in front of the values.
        fmt: format spec (including the leading ``:``) applied to both the
            latest value and the average when the meter is rendered.
    """

    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        """Forget every sample recorded so far."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as the mean of ``n`` new samples.

        When the total sample count is still zero after the call (for example
        ``n=0`` for an empty batch) the previous average is kept instead of
        dividing by zero.
        """
        self.val = val
        self.sum += val * n
        self.count += n
        if self.count:
            self.avg = self.sum / self.count

    def __str__(self):
        # Renders as "<name> <val> (<avg>)", both numbers formatted with self.fmt.
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)
def accuracy(output, target, topk=(1,)):
    """Compute the top-k accuracy, in percent, for every k in ``topk``.

    Args:
        output: score tensor; ``topk`` is taken along dimension 1.
        target: tensor of class indices, one per row of ``output``.
        topk: the values of k to evaluate.

    Returns:
        A list with one single-element float tensor per entry of ``topk``.
        A k larger than the number of classes is evaluated over all classes
        instead of raising, and an empty batch yields 0 for every k.
    """
    with torch.no_grad():
        batch_size = target.size(0)
        if batch_size == 0:
            # Nothing to score; avoid the 100.0 / 0 division below.
            return [torch.zeros(1, device=output.device) for _ in topk]

        # topk() rejects k greater than the size of the dimension, which
        # happens with the usual topk=(1, 5) on tasks with fewer than 5 classes.
        maxk = min(max(topk), output.size(1))

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            # Slicing past the last row simply keeps all maxk rows.
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res
# Every lower-case public callable exported by torchvision.models is offered
# as an architecture, next to the ViT variants listed explicitly below.
torchvision_model_names = sorted(name for name in torchvision_models.__dict__
                                 if name.islower() and not name.startswith("__")
                                 and callable(torchvision_models.__dict__[name]))

model_names = ['vit_small', 'vit_base', 'vit_conv_small', 'vit_conv_base'] + torchvision_model_names

parser = argparse.ArgumentParser(description='MoCo ImageNet Pre-Training')
# nargs='?' makes the positional optional; without it argparse always demands
# a value and the default below could never take effect.
parser.add_argument('data', nargs='?', default='/root/autodl-tmp/All',
                    help='path to dataset')
parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet50',
                    choices=model_names,
                    help='model architecture: ' +
                         ' | '.join(model_names) +
                         ' (default: resnet50)')
parser.add_argument('-j', '--workers', default=32, type=int, metavar='N',
                    help='number of data loading workers (default: 32)')
parser.add_argument('--epochs', default=100, type=int, metavar='N',
                    help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')
parser.add_argument('-b', '--batch-size', default=1024, type=int,
                    metavar='N',
                    help='mini-batch size (default: 1024), this is the total '
                         'batch size of all GPUs on all nodes when '
                         'using Data Parallel or Distributed Data Parallel')
parser.add_argument('--lr', '--learning-rate', default=1.5e-4, type=float,
                    metavar='LR', help='initial (base) learning rate', dest='lr')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum')
parser.add_argument('--wd', '--weight-decay', default=1e-6, type=float,
                    metavar='W', help='weight decay (default: 1e-6)',
                    dest='weight_decay')
parser.add_argument('-p', '--print-freq', default=10, type=int,
                    metavar='N', help='print frequency (default: 10)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
                    help='path to latest checkpoint (default: none)')
parser.add_argument('--world-size', default=-1, type=int,
                    help='number of nodes for distributed training')
parser.add_argument('--rank', default=-1, type=int,
                    help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
                    help='url used to set up distributed training')
# NCCL is the fastest and the recommended backend on GPU devices
parser.add_argument('--dist-backend', default='nccl', type=str,
                    help='distributed backend')
parser.add_argument('--seed', default=None, type=int,
                    help='seed for initializing training. ')
parser.add_argument('--gpu', default=None, type=int,
                    help='GPU id to use.')
parser.add_argument('--multiprocessing-distributed', action='store_true',
                    help='Use multi-processing distributed training to launch '
                         'N processes per node, which has N GPUs. This is the '
                         'fastest way to use PyTorch for either single node or '
                         'multi node data parallel training')
parser.add_argument('--saveckp_freq', default=50, type=int, help='Save checkpoint every x epochs.')

# moco specific configs:
parser.add_argument('--moco-dim', default=256, type=int,
                    help='feature dimension (default: 256)')
parser.add_argument('--moco-mlp-dim', default=4096, type=int,
                    help='hidden dimension in MLPs (default: 4096)')
parser.add_argument('--moco-m', default=0.99, type=float,
                    help='moco momentum of updating momentum encoder (default: 0.99)')
parser.add_argument('--moco-m-cos', action='store_true',
                    help='gradually increase moco momentum to 1 with a '
                         'half-cycle cosine schedule')
parser.add_argument('--moco-t', default=1.0, type=float,
                    help='softmax temperature (default: 1.0)')

# vit specific configs:
parser.add_argument('--stop-grad-conv1', action='store_true',
                    help='stop-grad after first conv, or patch embedding')

# other upgrades
parser.add_argument('--optimizer', default='lars', type=str,
                    choices=['lars', 'adamw'],
                    help='optimizer used (default: lars)')
parser.add_argument('--warmup-epochs', default=20, type=int, metavar='N',
                    help='number of warmup epochs')
parser.add_argument('--crop-min', default=0.08, type=float,
                    help='minimum scale for random cropping (default: 0.08)')
parser.add_argument('--basic_state_dict', default='/root/autodl-tmp/ViT_b16_224_Imagenet.pth', type=str,
                    help='Load in pretrained or un-pretrained model pth')
def main_worker(gpu, ngpus_per_node, args):
    """Build the MoCo model, optimizer and data pipeline, then train in this process.

    Args:
        gpu: index of the GPU driven by this process, or ``None``.
        ngpus_per_node: number of GPUs on this node; used to derive the global
            rank and to split the data-loading workers between processes.
        args: parsed command-line namespace. It is updated in place
            (``gpu``, ``rank``, ``lr``, ``batch_size``, ``workers`` and
            ``start_epoch`` may be rewritten).

    Raises:
        NotImplementedError: if CUDA is available but the run is not
            distributed; only DistributedDataParallel is supported.
    """
    args.gpu = gpu

    # suppress printing if not first GPU on each node
    if args.multiprocessing_distributed and (args.gpu != 0 or args.rank != 0):
        def print_pass(*_args, **_kwargs):
            # Keyword arguments are accepted too, so print(..., flush=True) or
            # print(..., end='') stays a no-op instead of raising TypeError.
            pass

        builtins.print = print_pass

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)
        torch.distributed.barrier()

    # create model
    print("=> creating model '{}'".format(args.arch))
    if args.arch.startswith('vit'):
        model = moco.builder.MoCo_ViT(
            partial(vits.__dict__[args.arch], stop_grad_conv1=args.stop_grad_conv1),
            args.moco_dim, args.moco_mlp_dim, args.moco_t)
        if args.basic_state_dict is not None:  # Transfer-learning
            try:
                # Deserialize onto the CPU: load_state_dict copies the values
                # into the model, so no process needs the tensors restored on
                # the device they were saved from.
                basic_state_dict = torch.load(args.basic_state_dict, map_location='cpu')
                if 'model' in basic_state_dict:
                    basic_state_dict = basic_state_dict['model']
                model.load_state_dict(basic_state_dict, False)
            except Exception as err:
                # Best effort: training continues from the freshly built model,
                # but the reason is reported and Ctrl-C / SystemExit are no
                # longer swallowed.
                print('erro in args.basic_state_dict:', args.basic_state_dict)
                print('=> reason: {!r}'.format(err))
                print('PreTuningRestart')  # the weights were not loaded
            else:
                print('PreTuning with Transfer-learning with:', args.basic_state_dict)
        else:
            print('PreTuning Restart')
    else:
        model = moco.builder.MoCo_ResNet(
            partial(torchvision_models.__dict__[args.arch], zero_init_residual=True),
            args.moco_dim, args.moco_mlp_dim, args.moco_t)

    # infer learning rate before changing batch size
    args.lr = args.lr * args.batch_size / 256

    if not torch.cuda.is_available():
        print('using CPU, this will be slow')
    elif args.distributed:
        # apply SyncBN
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / args.world_size)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
        # comment out the following line for debugging
        raise NotImplementedError("Only DistributedDataParallel is supported.")
    else:
        # AllGather/rank implementation in this code only supports DistributedDataParallel.
        raise NotImplementedError("Only DistributedDataParallel is supported.")
    print(model)  # print model after SyncBatchNorm

    if args.optimizer == 'lars':
        optimizer = moco.optimizer.LARS(model.parameters(), args.lr,
                                        weight_decay=args.weight_decay,
                                        momentum=args.momentum)
    elif args.optimizer == 'adamw':
        optimizer = torch.optim.AdamW(model.parameters(), args.lr,
                                      weight_decay=args.weight_decay)

    scaler = torch.cuda.amp.GradScaler()
    summary_writer = SummaryWriter() if args.rank == 0 else None

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            scaler.load_state_dict(checkpoint['scaler'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = args.data
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    # follow BYOL's augmentation recipe: https://arxiv.org/abs/2006.07733
    augmentation1 = [
        transforms.RandomResizedCrop(224, scale=(args.crop_min, 1.)),
        transforms.RandomApply([
            transforms.ColorJitter(0.4, 0.4, 0.2, 0.1)  # not strengthened
        ], p=0.8),
        transforms.RandomGrayscale(p=0.2),
        transforms.RandomApply([moco.loader.GaussianBlur([.1, 2.])], p=1.0),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize
    ]

    augmentation2 = [
        transforms.RandomResizedCrop(224, scale=(args.crop_min, 1.)),
        transforms.RandomApply([
            transforms.ColorJitter(0.4, 0.4, 0.2, 0.1)  # not strengthened
        ], p=0.8),
        transforms.RandomGrayscale(p=0.2),
        transforms.RandomApply([moco.loader.GaussianBlur([.1, 2.])], p=0.1),
        transforms.RandomApply([moco.loader.Solarize()], p=0.2),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize
    ]

    train_dataset = datasets.ImageFolder(
        traindir,
        moco.loader.TwoCropsTransform(transforms.Compose(augmentation1),
                                      transforms.Compose(augmentation2)))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler, drop_last=True)

    # only the first GPU saves checkpoint
    saves_checkpoints = not args.multiprocessing_distributed or args.rank == 0
    if saves_checkpoints and args.saveckp_freq:
        # The name parts do not change between epochs, so build them once.
        if args.basic_state_dict is not None:
            # e.g. 'ViT_b16_224_Imagenet'
            initial_setting = os.path.basename(args.basic_state_dict).split('.')[0]
        else:
            initial_setting = 'scratch'
        # normpath drops a trailing slash, which would otherwise leave the
        # dataset part of the file name empty
        dataset_using = os.path.basename(os.path.normpath(args.data))
        ckpt_prefix = 'moco_' + initial_setting + '_' + dataset_using
        ckpt_dir = '/home/CPIA/saved_models/MOCO'
        # Create the directory up front so a missing folder cannot fail the
        # first save after a whole epoch of training.
        os.makedirs(ckpt_dir, exist_ok=True)

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        # train for one epoch
        train(train_loader, model, optimizer, scaler, summary_writer, epoch, args)

        if saves_checkpoints and args.saveckp_freq and (
                epoch % args.saveckp_freq == 0 or epoch + 1 == args.epochs):
            ckpt_name = ckpt_prefix + '_checkpoint_%04d.pth.tar' % epoch
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'scaler': scaler.state_dict(),
            }, is_best=False,
                filename=os.path.join(ckpt_dir, ckpt_name))
            print(ckpt_name)

    if args.rank == 0:
        summary_writer.close()
class ProgressMeter(object):
    """Print one tab-separated progress line made of a batch counter and meters.

    Args:
        num_batches: total number of batches; fixes the width of the counter.
        meters: objects whose ``str()`` is appended to every printed line.
        prefix: text placed in front of the batch counter.
    """

    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        """Print the prefix, the counter for ``batch`` and every meter."""
        columns = [self.prefix + self.batch_fmtstr.format(batch)]
        columns.extend(str(meter) for meter in self.meters)
        print('\t'.join(columns))

    def _get_batch_fmtstr(self, num_batches):
        """Return a template such as ``[{:3d}/100]`` sized to ``num_batches``."""
        width = len(str(num_batches // 1))
        counter = '{:%dd}' % width
        return '[%s/%s]' % (counter, counter.format(num_batches))
def adjust_moco_momentum(epoch, args):
    """Return the momentum coefficient for the given (possibly fractional) epoch.

    The coefficient follows a half-cycle cosine: it equals ``args.moco_m`` at
    epoch 0 and rises to 1 at ``args.epochs``.
    """
    angle = math.pi * epoch / args.epochs
    cosine_weight = 0.5 * (1. + math.cos(angle))
    return 1. - cosine_weight * (1. - args.moco_m)
+ raise NotImplementedError("Only DistributedDataParallel is supported.") + print(model) # print model after SyncBatchNorm + + if args.optimizer == 'lars': + optimizer = moco.optimizer.LARS(model.parameters(), args.lr, + weight_decay=args.weight_decay, + momentum=args.momentum) + elif args.optimizer == 'adamw': + optimizer = torch.optim.AdamW(model.parameters(), args.lr, + weight_decay=args.weight_decay) + + # 使用自动混合精度 + scaler = NativeScaler() + summary_writer = SummaryWriter() if args.rank == 0 else None + + # optionally resume from a checkpoint + if args.resume: + if os.path.isfile(args.resume): + print("=> loading checkpoint '{}'".format(args.resume)) + if args.gpu is None: + checkpoint = torch.load(args.resume) + else: + # Map model to be loaded to specified single gpu. + loc = 'cuda:{}'.format(args.gpu) + checkpoint = torch.load(args.resume, map_location=loc) + args.start_epoch = checkpoint['epoch'] + model.load_state_dict(checkpoint['state_dict']) + optimizer.load_state_dict(checkpoint['optimizer']) + scaler.load_state_dict(checkpoint['scaler']) + print("=> loaded checkpoint '{}' (epoch {})" + .format(args.resume, checkpoint['epoch'])) + else: + print("=> no checkpoint found at '{}'".format(args.resume)) + + cudnn.benchmark = True + + # Data loading code + traindir = args.data + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + + # follow BYOL's augmentation recipe: https://arxiv.org/abs/2006.07733 + augmentation1 = [ + transforms.RandomResizedCrop(224, scale=(args.crop_min, 1.)), + transforms.RandomApply([ + transforms.ColorJitter(0.4, 0.4, 0.2, 0.1) # not strengthened + ], p=0.8), + transforms.RandomGrayscale(p=0.2), + transforms.RandomApply([moco.loader.GaussianBlur([.1, 2.])], p=1.0), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize + ] + + augmentation2 = [ + transforms.RandomResizedCrop(224, scale=(args.crop_min, 1.)), + transforms.RandomApply([ + transforms.ColorJitter(0.4, 0.4, 0.2, 
0.1) # not strengthened + ], p=0.8), + transforms.RandomGrayscale(p=0.2), + transforms.RandomApply([moco.loader.GaussianBlur([.1, 2.])], p=0.1), + transforms.RandomApply([moco.loader.Solarize()], p=0.2), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize + ] + + train_dataset = datasets.ImageFolder( + traindir, + moco.loader.TwoCropsTransform(transforms.Compose(augmentation1), + transforms.Compose(augmentation2))) + + if args.distributed: + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) + else: + train_sampler = None + + train_loader = torch.utils.data.DataLoader( + train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), + num_workers=args.workers, pin_memory=True, sampler=train_sampler, drop_last=True) + + for epoch in range(args.start_epoch, args.epochs): + if args.distributed: + train_sampler.set_epoch(epoch) + + # train for one epoch + train(train_loader, model, optimizer, scaler, summary_writer, epoch, args) + + if not args.multiprocessing_distributed or (args.multiprocessing_distributed + and args.rank == 0): # only the first GPU saves checkpoint + if epoch % 10 == 0: + save_checkpoint({ + 'epoch': epoch + 1, + 'arch': args.arch, + 'state_dict': model.state_dict(), + 'optimizer': optimizer.state_dict(), + 'scaler': scaler.state_dict(), + }, is_best=False, filename='moco_checkpoint_%04d.pth.tar' % epoch) + if args.rank == 0: + summary_writer.close() + + +def train(train_loader, model, optimizer, scaler, summary_writer, epoch, args): + batch_time = AverageMeter('Time', ':6.3f') + data_time = AverageMeter('Data', ':6.3f') + learning_rates = AverageMeter('LR', ':.4e') + losses = AverageMeter('Loss', ':.4e') + progress = ProgressMeter( + len(train_loader), + [batch_time, data_time, learning_rates, losses], + prefix="Epoch: [{}]".format(epoch)) + + # switch to train mode + model.train() + + end = time.time() + iters_per_epoch = len(train_loader) + moco_m = args.moco_m + for i, (images, _) in 
enumerate(train_loader): + # measure data loading time + data_time.update(time.time() - end) + + # adjust learning rate and momentum coefficient per iteration + lr = adjust_learning_rate(optimizer, epoch + i / iters_per_epoch, args) + learning_rates.update(lr) + if args.moco_m_cos: + moco_m = adjust_moco_momentum(epoch + i / iters_per_epoch, args) + + if args.gpu is not None: + images[0] = images[0].cuda(args.gpu, non_blocking=True) + images[1] = images[1].cuda(args.gpu, non_blocking=True) + + # compute output + with torch.cuda.amp.autocast(True): + loss = model(images[0], images[1], moco_m) + + losses.update(loss.item(), images[0].size(0)) + if args.rank == 0: + summary_writer.add_scalar("loss", loss.item(), epoch * iters_per_epoch + i) + + # compute gradient and do SGD step + optimizer.zero_grad() + # fixme 尝试使用梯度裁剪,集成在misc中 + """'''scaler.scale(loss).backward() + scaler.step(optimizer) + scaler.update()'''""" + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + progress.display(i) + + +def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): + torch.save(state, filename) + if is_best: + shutil.copyfile(filename, 'model_best.pth.tar') + + +class AverageMeter(object): + """'''Computes and stores the average and current value'''""" + def __init__(self, name, fmt=':f'): + self.name = name + self.fmt = fmt + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + def __str__(self): + fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' + return fmtstr.format(**self.__dict__) + + +class ProgressMeter(object): + def __init__(self, num_batches, meters, prefix=""): + self.batch_fmtstr = self._get_batch_fmtstr(num_batches) + self.meters = meters + self.prefix = prefix + + def display(self, batch): + entries = 
class SmoothedValue(object):
    """Keep a sliding window of recent values plus a running global total.

    Window statistics (``median``, ``avg``, ``max``, ``value``) are computed
    from the last ``window_size`` values; ``global_avg`` is ``total / count``
    over everything passed to ``update``.
    """

    def __init__(self, window_size=20, fmt=None):
        self.deque = deque(maxlen=window_size)
        self.total = 0.0
        self.count = 0
        self.fmt = "{median:.4f} ({global_avg:.4f})" if fmt is None else fmt

    def update(self, value, n=1):
        """Append ``value`` to the window and count it ``n`` times globally."""
        self.deque.append(value)
        self.count += n
        self.total += value * n

    def synchronize_between_processes(self):
        """Sum ``count`` and ``total`` across processes.

        Warning: does not synchronize the deque!
        """
        if is_dist_avail_and_initialized():
            packed = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
            dist.barrier()
            dist.all_reduce(packed)
            count, total = packed.tolist()
            self.count = int(count)
            self.total = total

    @property
    def median(self):
        return torch.tensor(list(self.deque)).median().item()

    @property
    def avg(self):
        # mean of the window, computed in float32
        window = torch.tensor(list(self.deque), dtype=torch.float32)
        return window.mean().item()

    @property
    def global_avg(self):
        return self.total / self.count

    @property
    def max(self):
        return max(self.deque)

    @property
    def value(self):
        return self.deque[-1]

    def __str__(self):
        stats = {
            'median': self.median,
            'avg': self.avg,
            'global_avg': self.global_avg,
            'max': self.max,
            'value': self.value,
        }
        return self.fmt.format(**stats)
self.delimiter.join(loss_str) + + def synchronize_between_processes(self): # 多进程同步 + for meter in self.meters.values(): + meter.synchronize_between_processes() + + def add_meter(self, name, meter): # 新增一个indicator元素 + self.meters[name] = meter + + def log_every(self, iterable, print_freq, header=None): # warp minibatch + # 初始化迭代idx + i = 0 + # 初始化头文件 + if not header: + header = '' + + # 初始化计时 + start_time = time.time() + end = time.time() + iter_time = SmoothedValue(fmt='{avg:.4f}') + data_time = SmoothedValue(fmt='{avg:.4f}') + space_fmt = ':' + str(len(str(len(iterable)))) + 'd' + # 初始化输出 + log_msg = [ + header, + '[{0' + space_fmt + '}/{1}]', + 'eta: {eta}', + '{meters}', + 'time: {time}', + 'data: {data}' + ] + if torch.cuda.is_available(): + log_msg.append('max mem: {memory:.0f}') + + log_msg = self.delimiter.join(log_msg) # 缩进 + + MB = 1024.0 * 1024.0 + + for obj in iterable: + + data_time.update(time.time() - end) + yield obj # 生成迭代的下一个对象 + iter_time.update(time.time() - end) + + if i % print_freq == 0 or i == len(iterable) - 1: + # 估算时间 + eta_seconds = iter_time.global_avg * (len(iterable) - i) + eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) + # 输出 + if torch.cuda.is_available(): + print(log_msg.format( + i, len(iterable), eta=eta_string, + meters=str(self), + time=str(iter_time), data=str(data_time), + memory=torch.cuda.max_memory_allocated() / MB)) + else: + print(log_msg.format( + i, len(iterable), eta=eta_string, + meters=str(self), + time=str(iter_time), data=str(data_time))) + i += 1 + end = time.time() + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('{} Total time: {} ({:.4f} s / it)'.format( + header, total_time_str, total_time / len(iterable))) + + +def setup_for_distributed(is_master): + """ + This function disables printing when not in master process + """ + builtin_print = builtins.print + + def print(*args, **kwargs): + force = kwargs.pop('force', False) + 
force = force or (get_world_size() > 8) + if is_master or force: + now = datetime.datetime.now().time() + builtin_print('[{}] '.format(now), end='') # print with time stamp + builtin_print(*args, **kwargs) + + builtins.print = print + + +def is_dist_avail_and_initialized(): + if not dist.is_available(): + return False + if not dist.is_initialized(): + return False + return True + + +def get_world_size(): + if not is_dist_avail_and_initialized(): + return 1 + return dist.get_world_size() + + +def get_rank(): + if not is_dist_avail_and_initialized(): + return 0 + return dist.get_rank() + + +def is_main_process(): + return get_rank() == 0 + + +def save_on_master(*args, **kwargs): + if is_main_process(): + torch.save(*args, **kwargs) + + +def init_distributed_mode(args): + """ + 配置多服务器环境文件信息,安排args.distributed + :param args: + :return: + """ + if args.dist_on_itp: + args.rank = int(os.environ['OMPI_COMM_WORLD_RANK']) + args.world_size = int(os.environ['OMPI_COMM_WORLD_SIZE']) + args.gpu = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK']) + args.dist_url = "tcp://%s:%s" % (os.environ['MASTER_ADDR'], os.environ['MASTER_PORT']) + os.environ['LOCAL_RANK'] = str(args.gpu) + os.environ['RANK'] = str(args.rank) + os.environ['WORLD_SIZE'] = str(args.world_size) + # ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"] + + elif 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: + args.rank = int(os.environ["RANK"]) + args.world_size = int(os.environ['WORLD_SIZE']) + args.gpu = int(os.environ['LOCAL_RANK']) + + elif 'SLURM_PROCID' in os.environ: + args.rank = int(os.environ['SLURM_PROCID']) + args.gpu = args.rank % torch.cuda.device_count() + + else: + print('Not using distributed mode') + setup_for_distributed(is_master=True) # hack + args.distributed = False + return + + args.distributed = True + + torch.cuda.set_device(args.gpu) + args.dist_backend = 'nccl' + print('| distributed init (rank {}): {}, gpu {}'.format( + args.rank, args.dist_url, args.gpu), flush=True) 
+ torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + torch.distributed.barrier() + setup_for_distributed(args.rank == 0) + + +class NativeScalerWithGradNormCount: + """ + 定义的 loss 优化器 + 基于自动混合精度训练设置的loss_scaler,额外增加了梯度裁剪的功能 + """ + state_dict_key = "amp_scaler" + + def __init__(self): + # 自动混合精度 + self._scaler = torch.cuda.amp.GradScaler() + + def __call__(self, loss, optimizer, clip_grad=None, parameters=None, create_graph=False, update_grad=True): + + # 反传 + # fixme 这里有修改 加了torch.ones_like(loss)解决梯度标量问题,不知道为啥存在 + # fixme 反转了,后来发现又不用加上torch.ones_like(loss) + self._scaler.scale(loss).backward(create_graph=create_graph) # create_graph + + if update_grad: + # 梯度裁剪 + if clip_grad is not None: + assert parameters is not None + self._scaler.unscale_(optimizer) # unscale the gradients of optimizer's assigned params in-place + norm = torch.nn.utils.clip_grad_norm_(parameters, clip_grad) + else: + self._scaler.unscale_(optimizer) + norm = get_grad_norm_(parameters) + + self._scaler.step(optimizer) # 使用optimizer更新模型 + + self._scaler.update() + else: + norm = None + + return norm + + def state_dict(self): # 记录loss_scaler的state_dict,应该就是保存梯度 + return self._scaler.state_dict() + + def load_state_dict(self, state_dict): # 还原某个checkpoint的state_dict + self._scaler.load_state_dict(state_dict) + + +def get_grad_norm_(parameters, norm_type: float = 2.0) -> torch.Tensor: + + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + + # 确定需要梯度的模型参数 + parameters = [p for p in parameters if p.grad is not None] + norm_type = float(norm_type) + + if len(parameters) == 0: + return torch.tensor(0.) 
+ + # 从对应GPU上进行操作 + device = parameters[0].grad.device + + if norm_type == inf: + # 面对norm_type == inf爆炸值,保留 + total_norm = max(p.grad.detach().abs().max().to(device) for p in parameters) + else: + # 无norm_type == inf爆炸值,做norm + total_norm = torch.norm(torch.stack([torch.norm(p.grad.detach(), norm_type).to(device) for p in parameters]), norm_type) + + return total_norm + + +def save_model(args, epoch, model, model_without_ddp, optimizer, loss_scaler, model_idx='SAE_'): + output_dir = Path(args.output_dir) + epoch_name = str(epoch) + + if loss_scaler is not None: + checkpoint_paths = [output_dir / (model_idx+'_checkpoint-%s.pth' % epoch_name)] + for checkpoint_path in checkpoint_paths: + to_save = { + 'model': model_without_ddp.state_dict(), + 'optimizer': optimizer.state_dict(), + 'epoch': epoch, + 'scaler': loss_scaler.state_dict(), + 'args': args, # 保存配置参数,但是在加载的时候不加载 + } + + save_on_master(to_save, checkpoint_path) + else: + client_state = {'epoch': epoch} + model.save_checkpoint(save_dir=args.output_dir, tag="checkpoint-%s" % epoch_name, client_state=client_state) + + +def load_model(args, model_without_ddp, optimizer, loss_scaler): + + # 加载配置checkpoint的路径args.resume,默认没有则不加载 + if args.resume: + if args.resume.startswith('https'): + checkpoint = torch.hub.load_state_dict_from_url( + args.resume, map_location='cpu', check_hash=True) + else: + checkpoint = torch.load(args.resume, map_location='cpu') + + model_without_ddp.load_state_dict(checkpoint['model']) + + print("Resume checkpoint %s" % args.resume) + + if 'optimizer' in checkpoint and 'epoch' in checkpoint and not (hasattr(args, 'eval') and args.eval): + optimizer.load_state_dict(checkpoint['optimizer']) + args.start_epoch = checkpoint['epoch'] + 1 + + if 'scaler' in checkpoint: + loss_scaler.load_state_dict(checkpoint['scaler']) + + print("With optim & sched!") + + +# 计算平均在单卡上的loss +def all_reduce_mean(x): + world_size = get_world_size() + + if world_size > 1: + x_reduce = torch.tensor(x).cuda() + 
dist.all_reduce(x_reduce) + x_reduce /= world_size + return x_reduce.item() + + else: + return x \ No newline at end of file diff --git a/PuzzleTuning/Counterpart PreTrain Methods/moco-v3-main/moco/__init__.py b/PuzzleTuning/Counterpart PreTrain Methods/moco-v3-main/moco/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..168f9979a4623806934b0ff1102ac166704e7dec --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/moco-v3-main/moco/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved diff --git a/PuzzleTuning/Counterpart PreTrain Methods/moco-v3-main/moco/builder.py b/PuzzleTuning/Counterpart PreTrain Methods/moco-v3-main/moco/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..268bd833d33d67deb65bb58d4974897bd176941a --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/moco-v3-main/moco/builder.py @@ -0,0 +1,137 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
import torch
import torch.nn as nn


class MoCo(nn.Module):
    """
    Build a MoCo model with a base encoder, a momentum encoder, and two MLPs
    https://arxiv.org/abs/1911.05722
    """
    def __init__(self, base_encoder, dim=256, mlp_dim=4096, T=1.0):
        """
        base_encoder: callable building an encoder, called with num_classes=mlp_dim
        dim: feature dimension (default: 256)
        mlp_dim: hidden dimension in MLPs (default: 4096)
        T: softmax temperature (default: 1.0)
        """
        super(MoCo, self).__init__()

        self.T = T

        # build encoders
        self.base_encoder = base_encoder(num_classes=mlp_dim)
        self.momentum_encoder = base_encoder(num_classes=mlp_dim)

        self._build_projector_and_predictor_mlps(dim, mlp_dim)

        for param_b, param_m in zip(self.base_encoder.parameters(), self.momentum_encoder.parameters()):
            param_m.data.copy_(param_b.data)  # initialize
            param_m.requires_grad = False  # not update by gradient

    def _build_mlp(self, num_layers, input_dim, mlp_dim, output_dim, last_bn=True):
        """Build a bias-free Linear/BatchNorm/ReLU stack with `num_layers` Linear layers."""
        mlp = []
        for layer_idx in range(num_layers):
            dim1 = input_dim if layer_idx == 0 else mlp_dim
            dim2 = output_dim if layer_idx == num_layers - 1 else mlp_dim

            mlp.append(nn.Linear(dim1, dim2, bias=False))

            if layer_idx < num_layers - 1:
                mlp.append(nn.BatchNorm1d(dim2))
                mlp.append(nn.ReLU(inplace=True))
            elif last_bn:
                # follow SimCLR's design: https://github.com/google-research/simclr/blob/master/model_util.py#L157
                # for simplicity, we further removed gamma in BN
                mlp.append(nn.BatchNorm1d(dim2, affine=False))

        return nn.Sequential(*mlp)

    def _build_projector_and_predictor_mlps(self, dim, mlp_dim):
        # Hook for subclasses, which attach the projector heads and self.predictor.
        pass

    @torch.no_grad()
    def _update_momentum_encoder(self, m):
        """Momentum update of the momentum encoder"""
        for param_b, param_m in zip(self.base_encoder.parameters(), self.momentum_encoder.parameters()):
            param_m.data = param_m.data * m + param_b.data * (1. - m)

    def contrastive_loss(self, q, k):
        """Cross-entropy contrastive loss between queries `q` and all-gathered keys `k`.

        Requires an initialized torch.distributed process group.
        """
        # normalize
        q = nn.functional.normalize(q, dim=1)
        k = nn.functional.normalize(k, dim=1)
        # gather all targets
        k = concat_all_gather(k)
        # Einstein sum is more intuitive
        logits = torch.einsum('nc,mc->nm', [q, k]) / self.T
        N = logits.shape[0]  # batch size per GPU
        # Build the labels on the logits' device instead of a hard-coded .cuda().
        labels = (torch.arange(N, dtype=torch.long, device=logits.device)
                  + N * torch.distributed.get_rank())
        return nn.CrossEntropyLoss()(logits, labels) * (2 * self.T)

    def forward(self, x1, x2, m):
        """
        Input:
            x1: first views of images
            x2: second views of images
            m: moco momentum
        Output:
            loss
        """

        # compute features
        q1 = self.predictor(self.base_encoder(x1))
        q2 = self.predictor(self.base_encoder(x2))

        with torch.no_grad():  # no gradient
            self._update_momentum_encoder(m)  # update the momentum encoder

            # compute momentum features as targets
            k1 = self.momentum_encoder(x1)
            k2 = self.momentum_encoder(x2)

        return self.contrastive_loss(q1, k2) + self.contrastive_loss(q2, k1)


class MoCo_ResNet(MoCo):
    def _build_projector_and_predictor_mlps(self, dim, mlp_dim):
        hidden_dim = self.base_encoder.fc.weight.shape[1]
        del self.base_encoder.fc, self.momentum_encoder.fc  # remove original fc layer

        # projectors
        self.base_encoder.fc = self._build_mlp(2, hidden_dim, mlp_dim, dim)
        self.momentum_encoder.fc = self._build_mlp(2, hidden_dim, mlp_dim, dim)

        # predictor
        self.predictor = self._build_mlp(2, dim, mlp_dim, dim, False)


class MoCo_ViT(MoCo):
    def _build_projector_and_predictor_mlps(self, dim, mlp_dim):
        hidden_dim = self.base_encoder.head.weight.shape[1]
        del self.base_encoder.head, self.momentum_encoder.head  # remove original fc layer

        # projectors
        self.base_encoder.head = self._build_mlp(3, hidden_dim, mlp_dim, dim)
        self.momentum_encoder.head = self._build_mlp(3, hidden_dim, mlp_dim, dim)

        # predictor
        self.predictor = self._build_mlp(2, dim, mlp_dim, dim)


# utils
@torch.no_grad()
def concat_all_gather(tensor):
    """
    Performs all_gather operation on the provided tensors.
    *** Warning ***: torch.distributed.all_gather has no gradient.
    """
    tensors_gather = [torch.ones_like(tensor)
                      for _ in range(torch.distributed.get_world_size())]
    torch.distributed.all_gather(tensors_gather, tensor, async_op=False)

    output = torch.cat(tensors_gather, dim=0)
    return output


# ---- moco/loader.py ----
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

from PIL import Image, ImageFilter, ImageOps
import math
import random
import torchvision.transforms.functional as tf


class TwoCropsTransform:
    """Take two random crops of one image"""

    def __init__(self, base_transform1, base_transform2):
        self.base_transform1 = base_transform1
        self.base_transform2 = base_transform2

    def __call__(self, x):
        im1 = self.base_transform1(x)
        im2 = self.base_transform2(x)
        return [im1, im2]


class GaussianBlur(object):
    """Gaussian blur augmentation from SimCLR: https://arxiv.org/abs/2002.05709"""

    def __init__(self, sigma=(.1, 2.)):
        # Immutable default: a shared list default could be mutated by one
        # instance and silently change every other instance.
        self.sigma = sigma

    def __call__(self, x):
        # Draw the blur radius uniformly from [sigma[0], sigma[1]].
        sigma = random.uniform(self.sigma[0], self.sigma[1])
        x = x.filter(ImageFilter.GaussianBlur(radius=sigma))
        return x


class Solarize(object):
    """Solarize augmentation from BYOL: https://arxiv.org/abs/2006.07733"""

    def __call__(self, x):
        return ImageOps.solarize(x)
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import torch


class LARS(torch.optim.Optimizer):
    """
    LARS optimizer, no rate scaling or weight decay for parameters <= 1D.
    """
    def __init__(self, params, lr=0, weight_decay=0, momentum=0.9, trust_coefficient=0.001):
        defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, trust_coefficient=trust_coefficient)
        super().__init__(params, defaults)

    @torch.no_grad()
    def step(self, closure=None):
        """Perform a single optimization step.

        Args:
            closure: optional callable that re-evaluates the model and returns
                the loss, as in the standard torch.optim.Optimizer.step contract.

        Returns:
            The closure's loss, or None when no closure is given.
        """
        loss = None
        if closure is not None:
            # step() runs under no_grad, so gradients must be re-enabled for
            # the closure's forward/backward pass.
            with torch.enable_grad():
                loss = closure()

        for g in self.param_groups:
            for p in g['params']:
                dp = p.grad

                if dp is None:
                    continue

                if p.ndim > 1:  # if not normalization gamma/beta or bias
                    dp = dp.add(p, alpha=g['weight_decay'])
                    param_norm = torch.norm(p)
                    update_norm = torch.norm(dp)
                    one = torch.ones_like(param_norm)
                    # trust ratio; falls back to 1 when either norm is zero
                    q = torch.where(param_norm > 0.,
                                    torch.where(update_norm > 0,
                                                (g['trust_coefficient'] * param_norm / update_norm), one),
                                    one)
                    dp = dp.mul(q)

                param_state = self.state[p]
                if 'mu' not in param_state:
                    param_state['mu'] = torch.zeros_like(p)
                mu = param_state['mu']  # momentum buffer
                mu.mul_(g['momentum']).add_(dp)
                p.add_(mu, alpha=-g['lr'])

        return loss
Learning with ViT + +This folder includes the transfer learning experiments on CIFAR-10, CIFAR-100, Flowers and Pets datasets. We provide finetuning recipes for the ViT-Base model. + +### Transfer Results + +The following results are based on ImageNet-1k self-supervised pre-training, followed by end-to-end fine-tuning on downstream datasets. All results are based on a batch size of 128 and 100 training epochs. + +#### ViT-Base, transfer learning + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
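| dataset | pretrain epochs | pretrain crops | finetune epochs | transfer acc |
| :-- | :-: | :-: | :-: | :-: |
| CIFAR-10 | 300 | 2x224 | 100 | 98.9 |
| CIFAR-100 | 300 | 2x224 | 100 | 90.5 |
| Flowers | 300 | 2x224 | 100 | 97.7 |
| Pets | 300 | 2x224 | 100 | 93.2 |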
+ +Similar to the end-to-end fine-tuning experiment on ImageNet, the transfer learning results are also obtained using the [DEiT](https://github.com/facebookresearch/deit) repo, with the default model [deit_base_patch16_224]. + +### Preparation: Transfer learning with ViT + +To perform transfer learning for ViT, use our script to convert the pre-trained ViT checkpoint to [DEiT](https://github.com/facebookresearch/deit) format: +``` +python convert_to_deit.py \ + --input [your checkpoint path]/[your checkpoint file].pth.tar \ + --output [target checkpoint file].pth +``` +Then copy (or replace) the following files to the DeiT folder: +``` +datasets.py +oxford_flowers_dataset.py +oxford_pets_dataset.py +``` + +#### Download and prepare the datasets + +Pets [\[Homepage\]](https://www.robots.ox.ac.uk/~vgg/data/pets/) +``` +./data/ +└── ./data/pets/ + ├── ./data/pets/annotations/ # split and label files + └── ./data/pets/images/ # data images +``` + +Flowers [\[Homepage\]](https://www.robots.ox.ac.uk/~vgg/data/flowers/102/) +``` +./data/ +└── ./data/flowers/ + ├── ./data/flowers/jpg/ # jpg images + ├── ./data/flowers/setid.mat # dataset split + └── ./data/flowers/imagelabels.mat # labels +``` + + +CIFAR-10/CIFAR-100 datasets will be downloaded automatically. 
+ + +### Transfer learning scripts (with a 8-GPU machine): + +#### CIFAR-10 +``` +python -u -m torch.distributed.launch --nproc_per_node=8 --use_env main.py \ + --batch-size 128 --output_dir [your output dir path] --epochs 100 --lr 3e-4 --weight-decay 0.1 \ + --no-pin-mem --warmup-epochs 3 --data-set cifar10 --data-path [cifar-10 data path] --no-repeated-aug \ + --resume [your pretrain checkpoint file] \ + --reprob 0.0 --drop-path 0.1 --mixup 0.8 --cutmix 1 +``` + +#### CIFAR-100 +``` +python -u -m torch.distributed.launch --nproc_per_node=8 --use_env main.py \ + --batch-size 128 --output_dir [your output dir path] --epochs 100 --lr 3e-4 --weight-decay 0.1 \ + --no-pin-mem --warmup-epochs 3 --data-set cifar100 --data-path [cifar-100 data path] --no-repeated-aug \ + --resume [your pretrain checkpoint file] \ + --reprob 0.0 --drop-path 0.1 --mixup 0.5 --cutmix 1 +``` + +#### Flowers +``` +python -u -m torch.distributed.launch --nproc_per_node=8 --use_env main.py \ + --batch-size 128 --output_dir [your output dir path] --epochs 100 --lr 3e-4 --weight-decay 0.3 \ + --no-pin-mem --warmup-epochs 3 --data-set flowers --data-path [oxford-flowers data path] --no-repeated-aug \ + --resume [your pretrain checkpoint file] \ + --reprob 0.25 --drop-path 0.1 --mixup 0 --cutmix 0 +``` + +#### Pets +``` +python -u -m torch.distributed.launch --nproc_per_node=8 --use_env main.py \ + --batch-size 128 --output_dir [your output dir path] --epochs 100 --lr 3e-4 --weight-decay 0.1 \ + --no-pin-mem --warmup-epochs 3 --data-set pets --data-path [oxford-pets data path] --no-repeated-aug \ + --resume [your pretrain checkpoint file] \ + --reprob 0 --drop-path 0 --mixup 0.8 --cutmix 0 +``` + +**Note**: +Similar to the ImageNet end-to-end finetuning experiment, we use `--resume` rather than `--finetune` in the DeiT repo, as its `--finetune` option trains under eval mode. When loading the pre-trained model, revise `model_without_ddp.load_state_dict(checkpoint['model'])` with `strict=False`. 
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import json
import os

from torchvision import datasets, transforms
from torchvision.datasets.folder import ImageFolder, default_loader

from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD

import oxford_flowers_dataset
import oxford_pets_dataset

# Built by implicit concatenation so the message no longer carries the
# indentation that the original backslash continuation embedded in it.
_UNSUPPORTED_MSG = (
    "Only [cifar10, cifar100, flowers, pets] are supported; "
    "for imagenet end-to-end finetuning, please refer to the instructions in the main README."
)


def build_transform(is_train, args):
    """Return the train or test preprocessing pipeline for `args.input_size`."""
    if is_train:
        return transforms.Compose([
            transforms.RandomResizedCrop((args.input_size, args.input_size), scale=(0.05, 1.0)),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD),
        ])
    return transforms.Compose([
        # resize by the 256/224 ratio before the centre crop
        transforms.Resize(int((256 / 224) * args.input_size)),
        transforms.CenterCrop(args.input_size),
        transforms.ToTensor(),
        transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD),
    ])


def build_dataset(is_train, args):
    """Build the dataset selected by `args.data_set`.

    Returns:
        (dataset, nb_classes)

    Raises:
        NotImplementedError: for any data set other than cifar10, cifar100,
            flowers or pets (this includes 'imagenet').
    """
    transform = build_transform(is_train, args)

    # The original had an ImageFolder branch for 'imagenet' placed after an
    # unconditional raise for the same value; that dead branch is removed and
    # 'imagenet' is rejected by the final else like any other unsupported name.
    if args.data_set == 'cifar10':
        dataset = datasets.CIFAR10(root=args.data_path,
                                   train=is_train,
                                   download=True,
                                   transform=transform)
        nb_classes = 10
    elif args.data_set == "cifar100":
        dataset = datasets.CIFAR100(root=args.data_path,
                                    train=is_train,
                                    download=True,
                                    transform=transform)
        nb_classes = 100
    elif args.data_set == "flowers":
        dataset = oxford_flowers_dataset.Flowers(root=args.data_path,
                                                 train=is_train,
                                                 download=False,
                                                 transform=transform)
        nb_classes = 102
    elif args.data_set == "pets":
        dataset = oxford_pets_dataset.Pets(root=args.data_path,
                                           train=is_train,
                                           download=False,
                                           transform=transform)
        nb_classes = 37
    else:
        raise NotImplementedError(_UNSUPPORTED_MSG)

    return dataset, nb_classes
from __future__ import print_function
from PIL import Image
from typing import Any, Callable, Optional, Tuple

import numpy as np
import os
import os.path
import pickle
import scipy.io

from torchvision.datasets.vision import VisionDataset


class Flowers(VisionDataset):
    """Oxford Flowers dataset read from `root/jpg`, `root/imagelabels.mat` and `root/setid.mat`.

    `train=True` uses the union of the `trnid` and `valid` splits; otherwise
    the `tstid` split is used.
    """

    def __init__(
            self,
            root,
            train=True,
            transform=None,
            target_transform=None,
            download=False,
    ):
        # `download` is accepted for interface parity with torchvision
        # datasets; this class never downloads anything.
        super(Flowers, self).__init__(root, transform=transform,
                                      target_transform=target_transform)

        base_folder = root
        self.image_folder = os.path.join(base_folder, "jpg")
        label_file = os.path.join(base_folder, "imagelabels.mat")
        setid_file = os.path.join(base_folder, "setid.mat")

        self.train = train

        self.labels = scipy.io.loadmat(label_file)["labels"][0]

        # Parse the split file once (it used to be loaded three times).
        setid = scipy.io.loadmat(setid_file)
        if self.train:
            self.img_files = np.concatenate([setid["trnid"][0], setid["valid"][0]])
        else:
            self.img_files = setid["tstid"][0]

    def __getitem__(self, index):
        image_id = self.img_files[index]
        img_name = "image_%05d.jpg" % image_id
        # image ids and labels in the .mat files are 1-based
        target = self.labels[image_id - 1] - 1
        # Close the file handle deterministically and always hand a 3-channel
        # image to the transform, as the sibling Pets dataset does.
        with Image.open(os.path.join(self.image_folder, img_name)) as img_file:
            img = img_file.convert('RGB')

        if self.transform is not None:
            img = self.transform(img)

        if self.target_transform is not None:
            target = self.target_transform(target)

        return img, target

    def __len__(self):
        return len(self.img_files)
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

from PIL import Image
from typing import Any, Callable, Optional, Tuple

import numpy as np
import os
import os.path
import pickle
import scipy.io

from torchvision.datasets.vision import VisionDataset


class Pets(VisionDataset):
    """Oxford-IIIT Pets dataset read from `root/annotations` and `root/images`.

    `train=True` reads `annotations/trainval.txt`; otherwise `annotations/test.txt`.
    """

    def __init__(
            self,
            root: str,
            train: bool = True,
            transform: Optional[Callable] = None,
            target_transform: Optional[Callable] = None,
            download: bool = False,
    ) -> None:
        # `download` is accepted for interface parity with torchvision
        # datasets; this class never downloads anything.
        super(Pets, self).__init__(root, transform=transform,
                                   target_transform=target_transform)

        base_folder = root
        self.train = train
        annotations_path_dir = os.path.join(base_folder, "annotations")
        self.image_path_dir = os.path.join(base_folder, "images")

        split_name = "trainval.txt" if self.train else "test.txt"
        split_file = os.path.join(annotations_path_dir, split_name)
        with open(split_file) as f:
            # Drop blank lines (e.g. a trailing newline at the end of the
            # file), which used to break the unpacking in __getitem__.
            self.images_list = [line for line in f if line.strip()]

    def __getitem__(self, index: int) -> Tuple[Any, Any]:
        # The original unpacked four space-separated fields per line; only the
        # first two (image name, 1-based label) are used.
        fields = self.images_list[index].split()
        img_name = fields[0] + ".jpg"
        target = int(fields[1]) - 1

        # Close the file handle deterministically instead of leaving it to GC.
        with Image.open(os.path.join(self.image_path_dir, img_name)) as img_file:
            img = img_file.convert('RGB')

        if self.transform is not None:
            img = self.transform(img)

        if self.target_transform is not None:
            target = self.target_transform(target)

        return img, target

    def __len__(self) -> int:
        return len(self.images_list)
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import math
import torch
import torch.nn as nn
from functools import partial, reduce
from operator import mul

from timm.models.vision_transformer import VisionTransformer, _cfg
from timm.models.layers.helpers import to_2tuple
from timm.models.layers import PatchEmbed

__all__ = [
    'vit_small',
    'vit_base',
    'vit_conv_small',
    'vit_conv_base',
]


class VisionTransformerMoCo(VisionTransformer):
    """timm VisionTransformer with a fixed 2D sin-cos position embedding and custom weight init."""

    def __init__(self, stop_grad_conv1=False, **kwargs):
        """
        stop_grad_conv1: freeze the patch-embedding projection (only applies
            when the patch embedding is timm's PatchEmbed)
        kwargs: forwarded unchanged to timm's VisionTransformer
        """
        super().__init__(**kwargs)
        # Use fixed 2D sin-cos position embedding
        self.build_2d_sincos_position_embedding()

        # weight initialization
        for name, m in self.named_modules():
            if isinstance(m, nn.Linear):
                if 'qkv' in name:
                    # treat the weights of Q, K, V separately
                    val = math.sqrt(6. / float(m.weight.shape[0] // 3 + m.weight.shape[1]))
                    nn.init.uniform_(m.weight, -val, val)
                else:
                    nn.init.xavier_uniform_(m.weight)
                nn.init.zeros_(m.bias)
        nn.init.normal_(self.cls_token, std=1e-6)

        if isinstance(self.patch_embed, PatchEmbed):
            # xavier_uniform initialization; fan-in uses the projection's real
            # input channel count (previously hard-coded to 3)
            in_chans = self.patch_embed.proj.in_channels
            val = math.sqrt(6. / float(in_chans * reduce(mul, self.patch_embed.patch_size, 1) + self.embed_dim))
            nn.init.uniform_(self.patch_embed.proj.weight, -val, val)
            nn.init.zeros_(self.patch_embed.proj.bias)

            if stop_grad_conv1:
                self.patch_embed.proj.weight.requires_grad = False
                self.patch_embed.proj.bias.requires_grad = False

    def build_2d_sincos_position_embedding(self, temperature=10000.):
        """Replace self.pos_embed with a frozen 2D sin-cos embedding (zeros for the [cls] token)."""
        h, w = self.patch_embed.grid_size
        grid_w = torch.arange(w, dtype=torch.float32)
        grid_h = torch.arange(h, dtype=torch.float32)
        grid_w, grid_h = torch.meshgrid(grid_w, grid_h)
        assert self.embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
        pos_dim = self.embed_dim // 4
        omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
        omega = 1. / (temperature**omega)
        out_w = torch.einsum('m,d->md', [grid_w.flatten(), omega])
        out_h = torch.einsum('m,d->md', [grid_h.flatten(), omega])
        pos_emb = torch.cat([torch.sin(out_w), torch.cos(out_w), torch.sin(out_h), torch.cos(out_h)], dim=1)[None, :, :]

        assert self.num_tokens == 1, 'Assuming one and only one token, [cls]'
        pe_token = torch.zeros([1, 1, self.embed_dim], dtype=torch.float32)
        self.pos_embed = nn.Parameter(torch.cat([pe_token, pos_emb], dim=1))
        self.pos_embed.requires_grad = False


class ConvStem(nn.Module):
    """
    ConvStem, from Early Convolutions Help Transformers See Better, Xiao et al. https://arxiv.org/abs/2106.14881
    """
    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True):
        super().__init__()

        assert patch_size == 16, 'ConvStem only supports patch size of 16'
        assert embed_dim % 8 == 0, 'Embed dimension must be divisible by 8 for ConvStem'

        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        self.img_size = img_size
        self.patch_size = patch_size
        self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
        self.num_patches = self.grid_size[0] * self.grid_size[1]
        self.flatten = flatten

        # build stem, similar to the design in https://arxiv.org/abs/2106.14881
        stem = []
        # honour in_chans (previously ignored in favour of a hard-coded 3)
        input_dim, output_dim = in_chans, embed_dim // 8
        for _ in range(4):
            stem.append(nn.Conv2d(input_dim, output_dim, kernel_size=3, stride=2, padding=1, bias=False))
            stem.append(nn.BatchNorm2d(output_dim))
            stem.append(nn.ReLU(inplace=True))
            input_dim = output_dim
            output_dim *= 2
        stem.append(nn.Conv2d(input_dim, embed_dim, kernel_size=1))
        self.proj = nn.Sequential(*stem)

        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

    def forward(self, x):
        B, C, H, W = x.shape
        assert H == self.img_size[0] and W == self.img_size[1], \
            f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
        x = self.proj(x)
        if self.flatten:
            x = x.flatten(2).transpose(1, 2)  # BCHW -> BNC
        x = self.norm(x)
        return x


def vit_small(**kwargs):
    model = VisionTransformerMoCo(
        patch_size=16, embed_dim=384, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
    model.default_cfg = _cfg()
    return model


def vit_base(**kwargs):
    model = VisionTransformerMoCo(
        patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
    model.default_cfg = _cfg()
    return model


def vit_conv_small(**kwargs):
    # minus one ViT block
    model = VisionTransformerMoCo(
        patch_size=16, embed_dim=384, depth=11, num_heads=12, mlp_ratio=4, qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6), embed_layer=ConvStem, **kwargs)
    model.default_cfg = _cfg()
    return model


def vit_conv_base(**kwargs):
    # minus one ViT block
    model = VisionTransformerMoCo(
        patch_size=16, embed_dim=768, depth=11, num_heads=12, mlp_ratio=4, qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6), embed_layer=ConvStem, **kwargs)
    model.default_cfg = _cfg()
    return model
Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/PuzzleTuning/Counterpart PreTrain Methods/simclr/README.md b/PuzzleTuning/Counterpart PreTrain Methods/simclr/README.md new file mode 100644 index 0000000000000000000000000000000000000000..dc94e88615990ea21264c35b1f85722d06187c5e --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/simclr/README.md @@ -0,0 +1,14 @@ +# SimCLR + +The original repo of SimCLR could be found [here](https://github.com/sthalles/SimCLR) + +To install environments: +```bash +pip install -r requirements.txt +``` + +To start pretraining: +```bash +# You need to alter the script according to your directories +bash pretrain.sh +``` diff --git a/PuzzleTuning/Counterpart PreTrain Methods/simclr/README_origin.md b/PuzzleTuning/Counterpart PreTrain Methods/simclr/README_origin.md new file mode 100644 index 0000000000000000000000000000000000000000..2d908618065dfe494a448029dd96be3999a000ce --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/simclr/README_origin.md @@ -0,0 +1,47 @@ +# PyTorch SimCLR: A Simple Framework for Contrastive Learning of Visual Representations +[![DOI](https://zenodo.org/badge/241184407.svg)](https://zenodo.org/badge/latestdoi/241184407) + + +### Blog post with full documentation: [Exploring SimCLR: A Simple Framework for Contrastive Learning of Visual 
Representations](https://sthalles.github.io/simple-self-supervised-learning/) + +![Image of SimCLR Arch](https://sthalles.github.io/assets/contrastive-self-supervised/cover.png) + +### See also [PyTorch Implementation for BYOL - Bootstrap Your Own Latent: A New Approach to Self-Supervised Learning](https://github.com/sthalles/PyTorch-BYOL). + +## Installation + +``` +$ conda env create --name simclr --file env.yml +$ conda activate simclr +$ python run.py +``` + +## Config file + +Before running SimCLR, make sure you choose the correct running configurations. You can change the running configurations by passing keyword arguments to the ```run.py``` file. + +```python + +$ python run.py -data ./datasets --dataset-name stl10 --log-every-n-steps 100 --epochs 100 + +``` + +If you want to run it on CPU (for debugging purposes) use the ```--disable-cuda``` option. + +For 16-bit precision GPU training, there **NO** need to to install [NVIDIA apex](https://github.com/NVIDIA/apex). Just use the ```--fp16_precision``` flag and this implementation will use [Pytorch built in AMP training](https://pytorch.org/docs/stable/notes/amp_examples.html). + +## Feature Evaluation + +Feature evaluation is done using a linear model protocol. + +First, we learned features using SimCLR on the ```STL10 unsupervised``` set. Then, we train a linear classifier on top of the frozen features from SimCLR. The linear model is trained on features extracted from the ```STL10 train``` set and evaluated on the ```STL10 test``` set. + +Check the [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://github.com/sthalles/SimCLR/blob/simclr-refactor/feature_eval/mini_batch_logistic_regression_evaluator.ipynb) notebook for reproducibility. + +Note that SimCLR benefits from **longer training**. 
+ +| Linear Classification | Dataset | Feature Extractor | Architecture | Feature dimensionality | Projection Head dimensionality | Epochs | Top1 % | +|----------------------------|---------|-------------------|---------------------------------------------------------------------------------|------------------------|--------------------------------|--------|--------| +| Logistic Regression (Adam) | STL10 | SimCLR | [ResNet-18](https://drive.google.com/open?id=14_nH2FkyKbt61cieQDiSbBVNP8-gtwgF) | 512 | 128 | 100 | 74.45 | +| Logistic Regression (Adam) | CIFAR10 | SimCLR | [ResNet-18](https://drive.google.com/open?id=1lc2aoVtrAetGn0PnTkOyFzPCIucOJq7C) | 512 | 128 | 100 | 69.82 | +| Logistic Regression (Adam) | STL10 | SimCLR | [ResNet-50](https://drive.google.com/open?id=1ByTKAUsdm_X7tLcii6oAEl5qFRqRMZSu) | 2048 | 128 | 50 | 70.075 | diff --git a/PuzzleTuning/Counterpart PreTrain Methods/simclr/data_aug/contrastive_learning_dataset.py b/PuzzleTuning/Counterpart PreTrain Methods/simclr/data_aug/contrastive_learning_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..8fe8880e3e61723155e2b3366fe65d85bae4bb2f --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/simclr/data_aug/contrastive_learning_dataset.py @@ -0,0 +1,53 @@ +from torchvision.transforms import transforms +from data_aug.gaussian_blur import GaussianBlur +from torchvision import transforms, datasets +from data_aug.view_generator import ContrastiveLearningViewGenerator +from exceptions.exceptions import InvalidDatasetSelection +import os + + +class ContrastiveLearningDataset: + def __init__(self, root_folder): + self.root_folder = root_folder + + @staticmethod + def get_simclr_pipeline_transform(size, s=1): + """Return a set of data augmentation transformations as described in the SimCLR paper.""" + color_jitter = transforms.ColorJitter(0.8 * s, 0.8 * s, 0.8 * s, 0.2 * s) + data_transforms = transforms.Compose([transforms.RandomResizedCrop(size=size), + 
transforms.RandomHorizontalFlip(), + transforms.RandomApply([color_jitter], p=0.8), + transforms.RandomGrayscale(p=0.2), + GaussianBlur(kernel_size=int(0.1 * size)), + transforms.ToTensor()]) + return data_transforms + + def get_dataset(self, name, n_views, mode='train', img_size=224): + valid_datasets = {'cifar10': lambda: datasets.CIFAR10(self.root_folder, train=True, + transform=ContrastiveLearningViewGenerator( + self.get_simclr_pipeline_transform(img_size), + n_views), + download=True), + + 'stl10': lambda: datasets.STL10(self.root_folder, split='unlabeled', + transform=ContrastiveLearningViewGenerator( + self.get_simclr_pipeline_transform(img_size), + n_views), + download=True), + 'imagefolder': lambda: datasets.ImageFolder(os.path.join(self.root_folder, mode), + transform=ContrastiveLearningViewGenerator( + self.get_simclr_pipeline_transform(img_size), + n_views)), + 'cpia-mini': lambda: datasets.ImageFolder(self.root_folder, + transform=ContrastiveLearningViewGenerator( + self.get_simclr_pipeline_transform(img_size), + n_views))} + + + try: + print(f'dataset: {name}') + dataset_fn = valid_datasets[name] + except KeyError: + raise InvalidDatasetSelection() + else: + return dataset_fn() diff --git a/PuzzleTuning/Counterpart PreTrain Methods/simclr/data_aug/gaussian_blur.py b/PuzzleTuning/Counterpart PreTrain Methods/simclr/data_aug/gaussian_blur.py new file mode 100644 index 0000000000000000000000000000000000000000..e3ad45039790e5b96c101f3fcd21f4199330adbf --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/simclr/data_aug/gaussian_blur.py @@ -0,0 +1,48 @@ +import numpy as np +import torch +from torch import nn +from torchvision.transforms import transforms + +np.random.seed(0) + + +class GaussianBlur(object): + """blur a single image on CPU""" + def __init__(self, kernel_size): + radias = kernel_size // 2 + kernel_size = radias * 2 + 1 + self.blur_h = nn.Conv2d(3, 3, kernel_size=(kernel_size, 1), + stride=1, padding=0, bias=False, groups=3) + 
self.blur_v = nn.Conv2d(3, 3, kernel_size=(1, kernel_size), + stride=1, padding=0, bias=False, groups=3) + self.k = kernel_size + self.r = radias + + self.blur = nn.Sequential( + nn.ReflectionPad2d(radias), + self.blur_h, + self.blur_v + ) + + self.pil_to_tensor = transforms.ToTensor() + self.tensor_to_pil = transforms.ToPILImage() + + def __call__(self, img): + img = self.pil_to_tensor(img).unsqueeze(0) + + sigma = np.random.uniform(0.1, 2.0) + x = np.arange(-self.r, self.r + 1) + x = np.exp(-np.power(x, 2) / (2 * sigma * sigma)) + x = x / x.sum() + x = torch.from_numpy(x).view(1, -1).repeat(3, 1) + + self.blur_h.weight.data.copy_(x.view(3, 1, self.k, 1)) + self.blur_v.weight.data.copy_(x.view(3, 1, 1, self.k)) + + with torch.no_grad(): + img = self.blur(img) + img = img.squeeze() + + img = self.tensor_to_pil(img) + + return img \ No newline at end of file diff --git a/PuzzleTuning/Counterpart PreTrain Methods/simclr/data_aug/view_generator.py b/PuzzleTuning/Counterpart PreTrain Methods/simclr/data_aug/view_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..aa4c4a5adb6d671bae28f9d1a5a82731bb9b2bdc --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/simclr/data_aug/view_generator.py @@ -0,0 +1,14 @@ +import numpy as np + +np.random.seed(0) + + +class ContrastiveLearningViewGenerator(object): + """Take two random crops of one image as the query and key.""" + + def __init__(self, base_transform, n_views=2): + self.base_transform = base_transform + self.n_views = n_views + + def __call__(self, x): + return [self.base_transform(x) for i in range(self.n_views)] diff --git a/PuzzleTuning/Counterpart PreTrain Methods/simclr/dataset_test.ipynb b/PuzzleTuning/Counterpart PreTrain Methods/simclr/dataset_test.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..68184fa79541cf88010d5618a767576b6bd13ce5 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/simclr/dataset_test.ipynb @@ -0,0 +1,312 @@ +{ + 
"cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "dataset: local\n" + ] + } + ], + "source": [ + "from data_aug.contrastive_learning_dataset import ContrastiveLearningDataset\n", + "import torch\n", + "\n", + "\n", + "dataset = ContrastiveLearningDataset('/data/ImageFolderLike/pRCC_CLS')\n", + "\n", + "train_dataset = dataset.get_dataset('local', n_views=2, mode='train', img_size=224)\n", + "\n", + "train_loader = torch.utils.data.DataLoader(\n", + " train_dataset, batch_size=1, shuffle=True, pin_memory=True, drop_last=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "img_list = []\n", + "for images, _ in train_loader:\n", + " img_list = images\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import torch\n", + "import torchvision.transforms as transforms\n", + "from PIL import Image\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# print(img_list[0].squeeze().shape)\n", + "plt.imshow(img_list[0].squeeze().permute(1, 2, 0))" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.imshow(img_list[1].squeeze().permute(1, 2, 0))" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "img_list = []\n", + "skip_cnt = 3\n", + "for images, _ in train_loader:\n", + " img_list = images\n", + " skip_cnt -= 1\n", + " if skip_cnt == 0:\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.imshow(img_list[0].squeeze().permute(1, 2, 0))" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.imshow(img_list[1].squeeze().permute(1, 2, 0))" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "dataset: stl10\n", + "Files already downloaded and verified\n" + ] + } + ], + "source": [ + "from data_aug.contrastive_learning_dataset import ContrastiveLearningDataset\n", + "import torch\n", + "\n", + "\n", + "dataset = ContrastiveLearningDataset('./datasets')\n", + "\n", + "train_dataset = dataset.get_dataset('stl10', n_views=2, mode='train', img_size=224)\n", + "\n", + "train_loader = torch.utils.data.DataLoader(\n", + " train_dataset, batch_size=1, shuffle=True, pin_memory=True, drop_last=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "img_list = []\n", + "skip_cnt = 123\n", + "for images, _ in train_loader:\n", + " img_list = images\n", + " skip_cnt -= 1\n", + " if skip_cnt == 0:\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.imshow(img_list[0].squeeze().permute(1, 2, 0))" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.imshow(img_list[1].squeeze().permute(1, 2, 0))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16 | packaged by conda-forge | (default, Feb 1 2023, 16:01:55) \n[GCC 11.3.0]" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "53427a73dce6cd561a14bc57d038a34300a2a6ca5e8afe9c5deb1771232f2ff8" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/PuzzleTuning/Counterpart PreTrain Methods/simclr/env.yml b/PuzzleTuning/Counterpart PreTrain Methods/simclr/env.yml new file mode 100644 index 0000000000000000000000000000000000000000..26a630558b418b486600b63eea7dba075104c182 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/simclr/env.yml @@ -0,0 +1,21 @@ +name: simclr +channels: + - pytorch + - anaconda + - conda-forge + - defaults +dependencies: + - cudatoolkit=10.1 + - numpy=1.18.1 + - opencv=3.4.2 + - pillow=7.0 + - pip=20.0 + - python=3.7.6 + - pytorch=1.4.0 + - torchvision=0.5 + - tensorboard=2.1 + - matplotlib=3.1.3 + - scikit-learn=0.22.1 + - pyyaml=5.3.1 + - nvidia-apex=0.1 + diff --git a/PuzzleTuning/Counterpart PreTrain Methods/simclr/exceptions/exceptions.py b/PuzzleTuning/Counterpart PreTrain Methods/simclr/exceptions/exceptions.py new file mode 100644 index 0000000000000000000000000000000000000000..a7370841cd2d638c64d7f640809c024877848643 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/simclr/exceptions/exceptions.py @@ -0,0 +1,10 @@ +class BaseSimCLRException(Exception): + """Base exception""" + + +class InvalidBackboneError(BaseSimCLRException): + """Raised when the choice of backbone Convnet 
is invalid.""" + + +class InvalidDatasetSelection(BaseSimCLRException): + """Raised when the choice of dataset is invalid.""" diff --git a/PuzzleTuning/Counterpart PreTrain Methods/simclr/feature_eval/mini_batch_logistic_regression_evaluator.ipynb b/PuzzleTuning/Counterpart PreTrain Methods/simclr/feature_eval/mini_batch_logistic_regression_evaluator.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..8fb01077f967f27e11673bba6207e03617694252 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/simclr/feature_eval/mini_batch_logistic_regression_evaluator.ipynb @@ -0,0 +1,821 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "kernelspec": { + "display_name": "pytorch", + "language": "python", + "name": "pytorch" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + }, + "colab": { + "name": "Copy of mini-batch-logistic-regression-evaluator.ipynb", + "provenance": [], + "include_colab_link": true + }, + "accelerator": "GPU", + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "149b9ce8fb68473a837a77431c12281a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "state": { + "_view_name": "HBoxView", + "_dom_classes": [], + "_model_name": "HBoxModel", + "_view_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_view_count": null, + "_view_module_version": "1.5.0", + "box_style": "", + "layout": "IPY_MODEL_88cd3db2831e4c13a4a634709700d6b2", + "_model_module": "@jupyter-widgets/controls", + "children": [ + "IPY_MODEL_a88c31d74f5c40a2b24bcff5a35d216c", + "IPY_MODEL_60c6150177694717a622936b830427b5" + ] + } + }, + "88cd3db2831e4c13a4a634709700d6b2": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_view_name": 
"LayoutView", + "grid_template_rows": null, + "right": null, + "justify_content": null, + "_view_module": "@jupyter-widgets/base", + "overflow": null, + "_model_module_version": "1.2.0", + "_view_count": null, + "flex_flow": null, + "width": null, + "min_width": null, + "border": null, + "align_items": null, + "bottom": null, + "_model_module": "@jupyter-widgets/base", + "top": null, + "grid_column": null, + "overflow_y": null, + "overflow_x": null, + "grid_auto_flow": null, + "grid_area": null, + "grid_template_columns": null, + "flex": null, + "_model_name": "LayoutModel", + "justify_items": null, + "grid_row": null, + "max_height": null, + "align_content": null, + "visibility": null, + "align_self": null, + "height": null, + "min_height": null, + "padding": null, + "grid_auto_rows": null, + "grid_gap": null, + "max_width": null, + "order": null, + "_view_module_version": "1.2.0", + "grid_template_areas": null, + "object_position": null, + "object_fit": null, + "grid_auto_columns": null, + "margin": null, + "display": null, + "left": null + } + }, + "a88c31d74f5c40a2b24bcff5a35d216c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "state": { + "_view_name": "ProgressView", + "style": "IPY_MODEL_dba019efadee4fdc8c799f309b9a7e70", + "_dom_classes": [], + "description": "", + "_model_name": "FloatProgressModel", + "bar_style": "info", + "max": 1, + "_view_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "value": 1, + "_view_count": null, + "_view_module_version": "1.5.0", + "orientation": "horizontal", + "min": 0, + "description_tooltip": null, + "_model_module": "@jupyter-widgets/controls", + "layout": "IPY_MODEL_5901c2829a554c8ebbd5926610088041" + } + }, + "60c6150177694717a622936b830427b5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "state": { + "_view_name": "HTMLView", + "style": "IPY_MODEL_957362a11d174407979cf17012bf9208", + "_dom_classes": [], + 
"description": "", + "_model_name": "HTMLModel", + "placeholder": "​", + "_view_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "value": " 2640404480/? [00:51<00:00, 32685718.58it/s]", + "_view_count": null, + "_view_module_version": "1.5.0", + "description_tooltip": null, + "_model_module": "@jupyter-widgets/controls", + "layout": "IPY_MODEL_a4f82234388e4701a02a9f68a177193a" + } + }, + "dba019efadee4fdc8c799f309b9a7e70": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "state": { + "_view_name": "StyleView", + "_model_name": "ProgressStyleModel", + "description_width": "initial", + "_view_module": "@jupyter-widgets/base", + "_model_module_version": "1.5.0", + "_view_count": null, + "_view_module_version": "1.2.0", + "bar_color": null, + "_model_module": "@jupyter-widgets/controls" + } + }, + "5901c2829a554c8ebbd5926610088041": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_view_name": "LayoutView", + "grid_template_rows": null, + "right": null, + "justify_content": null, + "_view_module": "@jupyter-widgets/base", + "overflow": null, + "_model_module_version": "1.2.0", + "_view_count": null, + "flex_flow": null, + "width": null, + "min_width": null, + "border": null, + "align_items": null, + "bottom": null, + "_model_module": "@jupyter-widgets/base", + "top": null, + "grid_column": null, + "overflow_y": null, + "overflow_x": null, + "grid_auto_flow": null, + "grid_area": null, + "grid_template_columns": null, + "flex": null, + "_model_name": "LayoutModel", + "justify_items": null, + "grid_row": null, + "max_height": null, + "align_content": null, + "visibility": null, + "align_self": null, + "height": null, + "min_height": null, + "padding": null, + "grid_auto_rows": null, + "grid_gap": null, + "max_width": null, + "order": null, + "_view_module_version": "1.2.0", + "grid_template_areas": null, + "object_position": null, + "object_fit": null, + 
"grid_auto_columns": null, + "margin": null, + "display": null, + "left": null + } + }, + "957362a11d174407979cf17012bf9208": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "state": { + "_view_name": "StyleView", + "_model_name": "DescriptionStyleModel", + "description_width": "", + "_view_module": "@jupyter-widgets/base", + "_model_module_version": "1.5.0", + "_view_count": null, + "_view_module_version": "1.2.0", + "_model_module": "@jupyter-widgets/controls" + } + }, + "a4f82234388e4701a02a9f68a177193a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_view_name": "LayoutView", + "grid_template_rows": null, + "right": null, + "justify_content": null, + "_view_module": "@jupyter-widgets/base", + "overflow": null, + "_model_module_version": "1.2.0", + "_view_count": null, + "flex_flow": null, + "width": null, + "min_width": null, + "border": null, + "align_items": null, + "bottom": null, + "_model_module": "@jupyter-widgets/base", + "top": null, + "grid_column": null, + "overflow_y": null, + "overflow_x": null, + "grid_auto_flow": null, + "grid_area": null, + "grid_template_columns": null, + "flex": null, + "_model_name": "LayoutModel", + "justify_items": null, + "grid_row": null, + "max_height": null, + "align_content": null, + "visibility": null, + "align_self": null, + "height": null, + "min_height": null, + "padding": null, + "grid_auto_rows": null, + "grid_gap": null, + "max_width": null, + "order": null, + "_view_module_version": "1.2.0", + "grid_template_areas": null, + "object_position": null, + "object_fit": null, + "grid_auto_columns": null, + "margin": null, + "display": null, + "left": null + } + } + } + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "YUemQib7ZE4D" + }, + "source": [ + "import torch\n", + 
"import sys\n", + "import numpy as np\n", + "import os\n", + "import yaml\n", + "import matplotlib.pyplot as plt\n", + "import torchvision" + ], + "execution_count": 10, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "WSgRE1CcLqdS", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "48a2ae15-f672-495b-8d43-9a23b85fa3b8" + }, + "source": [ + "!pip install gdown" + ], + "execution_count": 11, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Requirement already satisfied: gdown in /usr/local/lib/python3.6/dist-packages (3.6.4)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from gdown) (1.15.0)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from gdown) (2.23.0)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (from gdown) (4.41.1)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->gdown) (2020.12.5)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->gdown) (1.24.3)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->gdown) (3.0.4)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->gdown) (2.10)\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "NOIJEui1ZziV" + }, + "source": [ + "def get_file_id_by_model(folder_name):\n", + " file_id = {'resnet18_100-epochs_stl10': '14_nH2FkyKbt61cieQDiSbBVNP8-gtwgF',\n", + " 'resnet18_100-epochs_cifar10': '1lc2aoVtrAetGn0PnTkOyFzPCIucOJq7C',\n", + " 'resnet50_50-epochs_stl10': '1ByTKAUsdm_X7tLcii6oAEl5qFRqRMZSu'}\n", + " return file_id.get(folder_name, \"Model not found.\")" + ], + "execution_count": 12, + "outputs": [] + }, + { + "cell_type": "code", + 
"metadata": { + "id": "G7YMxsvEZMrX", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "59475430-69d2-45a2-b61b-ae755d5d6e88" + }, + "source": [ + "folder_name = 'resnet50_50-epochs_stl10'\n", + "file_id = get_file_id_by_model(folder_name)\n", + "print(folder_name, file_id)" + ], + "execution_count": 13, + "outputs": [ + { + "output_type": "stream", + "text": [ + "resnet50_50-epochs_stl10 1ByTKAUsdm_X7tLcii6oAEl5qFRqRMZSu\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "PWZ8fet_YoJm", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "fbaeb858-221b-4d1b-dd90-001a6e713b75" + }, + "source": [ + "# download and extract model files\n", + "os.system('gdown https://drive.google.com/uc?id={}'.format(file_id))\n", + "os.system('unzip {}'.format(folder_name))\n", + "!ls" + ], + "execution_count": 14, + "outputs": [ + { + "output_type": "stream", + "text": [ + "checkpoint_0040.pth.tar\n", + "config.yml\n", + "events.out.tfevents.1610927742.4cb2c837708d.2694093.0\n", + "resnet50_50-epochs_stl10.zip\n", + "sample_data\n", + "training.log\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "3_nypQVEv-hn" + }, + "source": [ + "from torch.utils.data import DataLoader\n", + "import torchvision.transforms as transforms\n", + "from torchvision import datasets" + ], + "execution_count": 15, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "lDfbL3w_Z0Od", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "7532966e-1c4a-4641-c928-4cda14c53389" + }, + "source": [ + "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n", + "print(\"Using device:\", device)" + ], + "execution_count": 16, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Using device: cuda\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "BfIPl0G6_RrT" + }, + "source": [ + "def 
get_stl10_data_loaders(download, shuffle=False, batch_size=256):\n", + " train_dataset = datasets.STL10('./data', split='train', download=download,\n", + " transform=transforms.ToTensor())\n", + "\n", + " train_loader = DataLoader(train_dataset, batch_size=batch_size,\n", + " num_workers=0, drop_last=False, shuffle=shuffle)\n", + " \n", + " test_dataset = datasets.STL10('./data', split='test', download=download,\n", + " transform=transforms.ToTensor())\n", + "\n", + " test_loader = DataLoader(test_dataset, batch_size=2*batch_size,\n", + " num_workers=10, drop_last=False, shuffle=shuffle)\n", + " return train_loader, test_loader\n", + "\n", + "def get_cifar10_data_loaders(download, shuffle=False, batch_size=256):\n", + " train_dataset = datasets.CIFAR10('./data', train=True, download=download,\n", + " transform=transforms.ToTensor())\n", + "\n", + " train_loader = DataLoader(train_dataset, batch_size=batch_size,\n", + " num_workers=0, drop_last=False, shuffle=shuffle)\n", + " \n", + " test_dataset = datasets.CIFAR10('./data', train=False, download=download,\n", + " transform=transforms.ToTensor())\n", + "\n", + " test_loader = DataLoader(test_dataset, batch_size=2*batch_size,\n", + " num_workers=10, drop_last=False, shuffle=shuffle)\n", + " return train_loader, test_loader" + ], + "execution_count": 17, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "6N8lYkbmDTaK" + }, + "source": [ + "with open(os.path.join('./config.yml')) as file:\n", + " config = yaml.load(file)" + ], + "execution_count": 18, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "a18lPD-tIle6" + }, + "source": [ + "if config.arch == 'resnet18':\n", + " model = torchvision.models.resnet18(pretrained=False, num_classes=10).to(device)\n", + "elif config.arch == 'resnet50':\n", + " model = torchvision.models.resnet50(pretrained=False, num_classes=10).to(device)" + ], + "execution_count": 19, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + 
"id": "4AIfgq41GuTT" + }, + "source": [ + "checkpoint = torch.load('checkpoint_0040.pth.tar', map_location=device)\n", + "state_dict = checkpoint['state_dict']\n", + "\n", + "for k in list(state_dict.keys()):\n", + "\n", + " if k.startswith('backbone.'):\n", + " if k.startswith('backbone') and not k.startswith('backbone.fc'):\n", + " # remove prefix\n", + " state_dict[k[len(\"backbone.\"):]] = state_dict[k]\n", + " del state_dict[k]" + ], + "execution_count": 21, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "VVjA83PPJYWl" + }, + "source": [ + "log = model.load_state_dict(state_dict, strict=False)\n", + "assert log.missing_keys == ['fc.weight', 'fc.bias']" + ], + "execution_count": 22, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "_GC0a14uWRr6", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 117, + "referenced_widgets": [ + "149b9ce8fb68473a837a77431c12281a", + "88cd3db2831e4c13a4a634709700d6b2", + "a88c31d74f5c40a2b24bcff5a35d216c", + "60c6150177694717a622936b830427b5", + "dba019efadee4fdc8c799f309b9a7e70", + "5901c2829a554c8ebbd5926610088041", + "957362a11d174407979cf17012bf9208", + "a4f82234388e4701a02a9f68a177193a" + ] + }, + "outputId": "4c2558db-921c-425e-f947-6cc746d8c749" + }, + "source": [ + "if config.dataset_name == 'cifar10':\n", + " train_loader, test_loader = get_cifar10_data_loaders(download=True)\n", + "elif config.dataset_name == 'stl10':\n", + " train_loader, test_loader = get_stl10_data_loaders(download=True)\n", + "print(\"Dataset:\", config.dataset_name)" + ], + "execution_count": 23, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Downloading http://ai.stanford.edu/~acoates/stl10/stl10_binary.tar.gz to ./data/stl10_binary.tar.gz\n" + ], + "name": "stdout" + }, + { + "output_type": "display_data", + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "149b9ce8fb68473a837a77431c12281a", + "version_minor": 0, + "version_major": 2 + }, + 
"text/plain": [ + "HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "stream", + "text": [ + "Extracting ./data/stl10_binary.tar.gz to ./data\n", + "Files already downloaded and verified\n", + "Dataset: stl10\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "pYT_KsM0Mnnr" + }, + "source": [ + "# freeze all layers but the last fc\n", + "for name, param in model.named_parameters():\n", + " if name not in ['fc.weight', 'fc.bias']:\n", + " param.requires_grad = False\n", + "\n", + "parameters = list(filter(lambda p: p.requires_grad, model.parameters()))\n", + "assert len(parameters) == 2 # fc.weight, fc.bias" + ], + "execution_count": 24, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "aPVh1S_eMRDU" + }, + "source": [ + "optimizer = torch.optim.Adam(model.parameters(), lr=3e-4, weight_decay=0.0008)\n", + "criterion = torch.nn.CrossEntropyLoss().to(device)" + ], + "execution_count": 25, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "edr6RhP2PdVq" + }, + "source": [ + "def accuracy(output, target, topk=(1,)):\n", + " \"\"\"Computes the accuracy over the k top predictions for the specified values of k\"\"\"\n", + " with torch.no_grad():\n", + " maxk = max(topk)\n", + " batch_size = target.size(0)\n", + "\n", + " _, pred = output.topk(maxk, 1, True, True)\n", + " pred = pred.t()\n", + " correct = pred.eq(target.view(1, -1).expand_as(pred))\n", + "\n", + " res = []\n", + " for k in topk:\n", + " correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)\n", + " res.append(correct_k.mul_(100.0 / batch_size))\n", + " return res" + ], + "execution_count": 26, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "qOder0dAMI7X", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "5f723b91-5a5e-43eb-ca01-a9b5ae2f1346" + }, + "source": [ + 
"epochs = 100\n", + "for epoch in range(epochs):\n", + " top1_train_accuracy = 0\n", + " for counter, (x_batch, y_batch) in enumerate(train_loader):\n", + " x_batch = x_batch.to(device)\n", + " y_batch = y_batch.to(device)\n", + "\n", + " logits = model(x_batch)\n", + " loss = criterion(logits, y_batch)\n", + " top1 = accuracy(logits, y_batch, topk=(1,))\n", + " top1_train_accuracy += top1[0]\n", + "\n", + " optimizer.zero_grad()\n", + " loss.backward()\n", + " optimizer.step()\n", + "\n", + " top1_train_accuracy /= (counter + 1)\n", + " top1_accuracy = 0\n", + " top5_accuracy = 0\n", + " for counter, (x_batch, y_batch) in enumerate(test_loader):\n", + " x_batch = x_batch.to(device)\n", + " y_batch = y_batch.to(device)\n", + "\n", + " logits = model(x_batch)\n", + " \n", + " top1, top5 = accuracy(logits, y_batch, topk=(1,5))\n", + " top1_accuracy += top1[0]\n", + " top5_accuracy += top5[0]\n", + " \n", + " top1_accuracy /= (counter + 1)\n", + " top5_accuracy /= (counter + 1)\n", + " print(f\"Epoch {epoch}\\tTop1 Train accuracy {top1_train_accuracy.item()}\\tTop1 Test accuracy: {top1_accuracy.item()}\\tTop5 test acc: {top5_accuracy.item()}\")" + ], + "execution_count": 27, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Epoch 0\tTop1 Train accuracy 28.7109375\tTop1 Test accuracy: 43.75\tTop5 test acc: 93.837890625\n", + "Epoch 1\tTop1 Train accuracy 49.37959671020508\tTop1 Test accuracy: 52.8662109375\tTop5 test acc: 95.439453125\n", + "Epoch 2\tTop1 Train accuracy 55.257354736328125\tTop1 Test accuracy: 56.45263671875\tTop5 test acc: 95.91796875\n", + "Epoch 3\tTop1 Train accuracy 57.51838302612305\tTop1 Test accuracy: 57.39013671875\tTop5 test acc: 96.19384765625\n", + "Epoch 4\tTop1 Train accuracy 58.727020263671875\tTop1 Test accuracy: 58.2568359375\tTop5 test acc: 96.435546875\n", + "Epoch 5\tTop1 Train accuracy 59.677162170410156\tTop1 Test accuracy: 58.7353515625\tTop5 test acc: 96.50390625\n", + "Epoch 6\tTop1 Train accuracy 
60.065486907958984\tTop1 Test accuracy: 59.17724609375\tTop5 test acc: 96.708984375\n", + "Epoch 7\tTop1 Train accuracy 60.612361907958984\tTop1 Test accuracy: 59.482421875\tTop5 test acc: 96.74560546875\n", + "Epoch 8\tTop1 Train accuracy 60.827205657958984\tTop1 Test accuracy: 59.66064453125\tTop5 test acc: 96.77490234375\n", + "Epoch 9\tTop1 Train accuracy 61.100643157958984\tTop1 Test accuracy: 60.09521484375\tTop5 test acc: 96.82373046875\n", + "Epoch 10\tTop1 Train accuracy 61.52803421020508\tTop1 Test accuracy: 60.3466796875\tTop5 test acc: 96.82861328125\n", + "Epoch 11\tTop1 Train accuracy 61.80147171020508\tTop1 Test accuracy: 60.6640625\tTop5 test acc: 96.8896484375\n", + "Epoch 12\tTop1 Train accuracy 62.09444046020508\tTop1 Test accuracy: 60.96435546875\tTop5 test acc: 96.99462890625\n", + "Epoch 13\tTop1 Train accuracy 62.541358947753906\tTop1 Test accuracy: 61.13037109375\tTop5 test acc: 97.0068359375\n", + "Epoch 14\tTop1 Train accuracy 62.853858947753906\tTop1 Test accuracy: 61.34033203125\tTop5 test acc: 97.01904296875\n", + "Epoch 15\tTop1 Train accuracy 62.951515197753906\tTop1 Test accuracy: 61.5673828125\tTop5 test acc: 96.99951171875\n", + "Epoch 16\tTop1 Train accuracy 63.400733947753906\tTop1 Test accuracy: 61.806640625\tTop5 test acc: 97.0361328125\n", + "Epoch 17\tTop1 Train accuracy 63.66958236694336\tTop1 Test accuracy: 61.98974609375\tTop5 test acc: 97.0849609375\n", + "Epoch 18\tTop1 Train accuracy 63.82583236694336\tTop1 Test accuracy: 62.265625\tTop5 test acc: 97.07275390625\n", + "Epoch 19\tTop1 Train accuracy 64.1187973022461\tTop1 Test accuracy: 62.412109375\tTop5 test acc: 97.09716796875\n", + "Epoch 20\tTop1 Train accuracy 64.2750473022461\tTop1 Test accuracy: 62.56591796875\tTop5 test acc: 97.12158203125\n", + "Epoch 21\tTop1 Train accuracy 64.4140625\tTop1 Test accuracy: 62.724609375\tTop5 test acc: 97.20703125\n", + "Epoch 22\tTop1 Train accuracy 64.53125\tTop1 Test accuracy: 62.90771484375\tTop5 test acc: 97.255859375\n", + 
"Epoch 23\tTop1 Train accuracy 64.6484375\tTop1 Test accuracy: 62.95654296875\tTop5 test acc: 97.29248046875\n", + "Epoch 24\tTop1 Train accuracy 64.86328125\tTop1 Test accuracy: 63.12255859375\tTop5 test acc: 97.35595703125\n", + "Epoch 25\tTop1 Train accuracy 65.1344223022461\tTop1 Test accuracy: 63.330078125\tTop5 test acc: 97.40478515625\n", + "Epoch 26\tTop1 Train accuracy 65.3297348022461\tTop1 Test accuracy: 63.3984375\tTop5 test acc: 97.44873046875\n", + "Epoch 27\tTop1 Train accuracy 65.4469223022461\tTop1 Test accuracy: 63.34228515625\tTop5 test acc: 97.412109375\n", + "Epoch 28\tTop1 Train accuracy 65.6227035522461\tTop1 Test accuracy: 63.48876953125\tTop5 test acc: 97.412109375\n", + "Epoch 29\tTop1 Train accuracy 65.85478210449219\tTop1 Test accuracy: 63.56201171875\tTop5 test acc: 97.42431640625\n", + "Epoch 30\tTop1 Train accuracy 66.06732940673828\tTop1 Test accuracy: 63.67431640625\tTop5 test acc: 97.4560546875\n", + "Epoch 31\tTop1 Train accuracy 66.20404815673828\tTop1 Test accuracy: 63.80859375\tTop5 test acc: 97.48046875\n", + "Epoch 32\tTop1 Train accuracy 66.24080657958984\tTop1 Test accuracy: 63.92578125\tTop5 test acc: 97.5048828125\n", + "Epoch 33\tTop1 Train accuracy 66.58777618408203\tTop1 Test accuracy: 63.9990234375\tTop5 test acc: 97.529296875\n", + "Epoch 34\tTop1 Train accuracy 66.70496368408203\tTop1 Test accuracy: 64.1455078125\tTop5 test acc: 97.51708984375\n", + "Epoch 35\tTop1 Train accuracy 66.80261993408203\tTop1 Test accuracy: 64.20654296875\tTop5 test acc: 97.529296875\n", + "Epoch 36\tTop1 Train accuracy 66.91980743408203\tTop1 Test accuracy: 64.32861328125\tTop5 test acc: 97.51708984375\n", + "Epoch 37\tTop1 Train accuracy 66.93933868408203\tTop1 Test accuracy: 64.3896484375\tTop5 test acc: 97.51708984375\n", + "Epoch 38\tTop1 Train accuracy 66.97840118408203\tTop1 Test accuracy: 64.47021484375\tTop5 test acc: 97.529296875\n", + "Epoch 39\tTop1 Train accuracy 67.11282348632812\tTop1 Test accuracy: 64.53125\tTop5 test acc: 
97.56591796875\n", + "Epoch 40\tTop1 Train accuracy 67.24954223632812\tTop1 Test accuracy: 64.6044921875\tTop5 test acc: 97.6025390625\n", + "Epoch 41\tTop1 Train accuracy 67.34949493408203\tTop1 Test accuracy: 64.62890625\tTop5 test acc: 97.59033203125\n", + "Epoch 42\tTop1 Train accuracy 67.42761993408203\tTop1 Test accuracy: 64.7265625\tTop5 test acc: 97.6025390625\n", + "Epoch 43\tTop1 Train accuracy 67.52527618408203\tTop1 Test accuracy: 64.84375\tTop5 test acc: 97.61474609375\n", + "Epoch 44\tTop1 Train accuracy 67.58386993408203\tTop1 Test accuracy: 64.87548828125\tTop5 test acc: 97.61474609375\n", + "Epoch 45\tTop1 Train accuracy 67.64246368408203\tTop1 Test accuracy: 64.9365234375\tTop5 test acc: 97.626953125\n", + "Epoch 46\tTop1 Train accuracy 67.75735473632812\tTop1 Test accuracy: 65.0341796875\tTop5 test acc: 97.66357421875\n", + "Epoch 47\tTop1 Train accuracy 67.85501098632812\tTop1 Test accuracy: 65.1318359375\tTop5 test acc: 97.7001953125\n", + "Epoch 48\tTop1 Train accuracy 67.89407348632812\tTop1 Test accuracy: 65.1318359375\tTop5 test acc: 97.73681640625\n", + "Epoch 49\tTop1 Train accuracy 67.95266723632812\tTop1 Test accuracy: 65.15625\tTop5 test acc: 97.73681640625\n", + "Epoch 50\tTop1 Train accuracy 68.01126098632812\tTop1 Test accuracy: 65.21728515625\tTop5 test acc: 97.76123046875\n", + "Epoch 51\tTop1 Train accuracy 68.05032348632812\tTop1 Test accuracy: 65.29052734375\tTop5 test acc: 97.7490234375\n", + "Epoch 52\tTop1 Train accuracy 68.05032348632812\tTop1 Test accuracy: 65.3564453125\tTop5 test acc: 97.78564453125\n", + "Epoch 53\tTop1 Train accuracy 68.20657348632812\tTop1 Test accuracy: 65.3759765625\tTop5 test acc: 97.7978515625\n", + "Epoch 54\tTop1 Train accuracy 68.28469848632812\tTop1 Test accuracy: 65.45654296875\tTop5 test acc: 97.822265625\n", + "Epoch 55\tTop1 Train accuracy 68.41912078857422\tTop1 Test accuracy: 65.46875\tTop5 test acc: 97.8466796875\n", + "Epoch 56\tTop1 Train accuracy 68.45818328857422\tTop1 Test 
accuracy: 65.5615234375\tTop5 test acc: 97.85888671875\n", + "Epoch 57\tTop1 Train accuracy 68.61443328857422\tTop1 Test accuracy: 65.56640625\tTop5 test acc: 97.87109375\n", + "Epoch 58\tTop1 Train accuracy 68.71208953857422\tTop1 Test accuracy: 65.5859375\tTop5 test acc: 97.90771484375\n", + "Epoch 59\tTop1 Train accuracy 68.69255828857422\tTop1 Test accuracy: 65.64697265625\tTop5 test acc: 97.919921875\n", + "Epoch 60\tTop1 Train accuracy 68.80744934082031\tTop1 Test accuracy: 65.64697265625\tTop5 test acc: 97.93212890625\n", + "Epoch 61\tTop1 Train accuracy 68.94416809082031\tTop1 Test accuracy: 65.72021484375\tTop5 test acc: 97.93212890625\n", + "Epoch 62\tTop1 Train accuracy 69.04182434082031\tTop1 Test accuracy: 65.76904296875\tTop5 test acc: 97.919921875\n", + "Epoch 63\tTop1 Train accuracy 69.06135559082031\tTop1 Test accuracy: 65.84228515625\tTop5 test acc: 97.90771484375\n", + "Epoch 64\tTop1 Train accuracy 69.19807434082031\tTop1 Test accuracy: 65.93505859375\tTop5 test acc: 97.90771484375\n", + "Epoch 65\tTop1 Train accuracy 69.23713684082031\tTop1 Test accuracy: 65.95947265625\tTop5 test acc: 97.9150390625\n", + "Epoch 66\tTop1 Train accuracy 69.25666809082031\tTop1 Test accuracy: 66.0888671875\tTop5 test acc: 97.939453125\n", + "Epoch 67\tTop1 Train accuracy 69.31526184082031\tTop1 Test accuracy: 66.02783203125\tTop5 test acc: 97.939453125\n", + "Epoch 68\tTop1 Train accuracy 69.43014526367188\tTop1 Test accuracy: 66.07666015625\tTop5 test acc: 97.9638671875\n", + "Epoch 69\tTop1 Train accuracy 69.48873901367188\tTop1 Test accuracy: 66.12060546875\tTop5 test acc: 97.9638671875\n", + "Epoch 70\tTop1 Train accuracy 69.50827026367188\tTop1 Test accuracy: 66.083984375\tTop5 test acc: 97.95166015625\n", + "Epoch 71\tTop1 Train accuracy 69.60592651367188\tTop1 Test accuracy: 66.1572265625\tTop5 test acc: 97.9638671875\n", + "Epoch 72\tTop1 Train accuracy 69.68635559082031\tTop1 Test accuracy: 66.2060546875\tTop5 test acc: 97.95166015625\n", + "Epoch 
73\tTop1 Train accuracy 69.78170776367188\tTop1 Test accuracy: 66.2744140625\tTop5 test acc: 97.92724609375\n", + "Epoch 74\tTop1 Train accuracy 69.84030151367188\tTop1 Test accuracy: 66.31591796875\tTop5 test acc: 97.92724609375\n", + "Epoch 75\tTop1 Train accuracy 69.89889526367188\tTop1 Test accuracy: 66.328125\tTop5 test acc: 97.9150390625\n", + "Epoch 76\tTop1 Train accuracy 69.93795776367188\tTop1 Test accuracy: 66.41357421875\tTop5 test acc: 97.92724609375\n", + "Epoch 77\tTop1 Train accuracy 69.95748901367188\tTop1 Test accuracy: 66.41357421875\tTop5 test acc: 97.9150390625\n", + "Epoch 78\tTop1 Train accuracy 70.01608276367188\tTop1 Test accuracy: 66.474609375\tTop5 test acc: 97.9150390625\n", + "Epoch 79\tTop1 Train accuracy 69.99655151367188\tTop1 Test accuracy: 66.53564453125\tTop5 test acc: 97.939453125\n", + "Epoch 80\tTop1 Train accuracy 70.01608276367188\tTop1 Test accuracy: 66.56005859375\tTop5 test acc: 97.939453125\n", + "Epoch 81\tTop1 Train accuracy 70.09420776367188\tTop1 Test accuracy: 66.56494140625\tTop5 test acc: 97.939453125\n", + "Epoch 82\tTop1 Train accuracy 70.11373901367188\tTop1 Test accuracy: 66.650390625\tTop5 test acc: 97.939453125\n", + "Epoch 83\tTop1 Train accuracy 70.19186401367188\tTop1 Test accuracy: 66.71142578125\tTop5 test acc: 97.92724609375\n", + "Epoch 84\tTop1 Train accuracy 70.26998901367188\tTop1 Test accuracy: 66.7236328125\tTop5 test acc: 97.90283203125\n", + "Epoch 85\tTop1 Train accuracy 70.32858276367188\tTop1 Test accuracy: 66.73583984375\tTop5 test acc: 97.90283203125\n", + "Epoch 86\tTop1 Train accuracy 70.32858276367188\tTop1 Test accuracy: 66.748046875\tTop5 test acc: 97.890625\n", + "Epoch 87\tTop1 Train accuracy 70.46530151367188\tTop1 Test accuracy: 66.7724609375\tTop5 test acc: 97.890625\n", + "Epoch 88\tTop1 Train accuracy 70.52389526367188\tTop1 Test accuracy: 66.78466796875\tTop5 test acc: 97.90283203125\n", + "Epoch 89\tTop1 Train accuracy 70.56295776367188\tTop1 Test accuracy: 
66.78466796875\tTop5 test acc: 97.890625\n", + "Epoch 90\tTop1 Train accuracy 70.68014526367188\tTop1 Test accuracy: 66.83349609375\tTop5 test acc: 97.87841796875\n", + "Epoch 91\tTop1 Train accuracy 70.77780151367188\tTop1 Test accuracy: 66.826171875\tTop5 test acc: 97.87841796875\n", + "Epoch 92\tTop1 Train accuracy 70.81686401367188\tTop1 Test accuracy: 66.88720703125\tTop5 test acc: 97.87841796875\n", + "Epoch 93\tTop1 Train accuracy 70.85592651367188\tTop1 Test accuracy: 66.8994140625\tTop5 test acc: 97.87841796875\n", + "Epoch 94\tTop1 Train accuracy 70.91452026367188\tTop1 Test accuracy: 66.9482421875\tTop5 test acc: 97.890625\n", + "Epoch 95\tTop1 Train accuracy 71.03170776367188\tTop1 Test accuracy: 66.98486328125\tTop5 test acc: 97.890625\n", + "Epoch 96\tTop1 Train accuracy 71.09030151367188\tTop1 Test accuracy: 67.001953125\tTop5 test acc: 97.91015625\n", + "Epoch 97\tTop1 Train accuracy 71.09030151367188\tTop1 Test accuracy: 67.0263671875\tTop5 test acc: 97.91015625\n", + "Epoch 98\tTop1 Train accuracy 71.12936401367188\tTop1 Test accuracy: 67.06298828125\tTop5 test acc: 97.89794921875\n", + "Epoch 99\tTop1 Train accuracy 71.12936401367188\tTop1 Test accuracy: 67.0751953125\tTop5 test acc: 97.8857421875\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "dtYqHZirMNZk" + }, + "source": [ + "" + ], + "execution_count": 27, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/PuzzleTuning/Counterpart PreTrain Methods/simclr/load_vit_from_ckpt.py b/PuzzleTuning/Counterpart PreTrain Methods/simclr/load_vit_from_ckpt.py new file mode 100644 index 0000000000000000000000000000000000000000..eae311f5f0a67864f435a4f6495ae1bdb269f612 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/simclr/load_vit_from_ckpt.py @@ -0,0 +1,122 @@ +""" +Extracting backbone from a specified SimCLR checkpoint. 
+ +Example: + +python load_vit_from_ckpt.py \ + --checkpoint ./runs/Aug13_10-31-32_lsq/checkpoint_0016.pth.tar \ + --save-to ./output \ + --save-name vit_simclr_16_224.pth \ + --num-classes 2 +""" + +import torchvision +import torch +import os +import argparse +from timm import create_model +# from net.models.vit import VisionTransformer + + +def gen_basic_weight(save_dir): + # Load timm vit weight + model = create_model('vit_base_patch16_224', pretrained=False, in_chans=3) + random_state_dict = model.state_dict() + + model = create_model('vit_base_patch16_224', pretrained=True, in_chans=3) + pretrained_state_dict = model.state_dict() + + # Save model + print(f'Saving backbone init weight to {save_dir}...') + if not os.path.exists(save_dir): + os.makedirs(save_dir) + torch.save(random_state_dict, os.path.join(save_dir, 'ViT_b16_224_Random_Init.pth')) + torch.save(pretrained_state_dict, os.path.join(save_dir, 'ViT_b16_224_Imagenet.pth')) + + +def main(args): + """Read ViT parameters from BYOL backbone + """ + + # Initialize model + if args.basic_weight: + model = create_model('vit_base_patch16_224', pretrained=False, in_chans=3) + # model = VisionTransformer(num_classes=args.num_classes) + + # Load basic weights (default initial parameters) + basic_weight = torch.load(args.basic_weight) + model.load_state_dict(basic_weight, strict=False) + else: + raise + model = create_model('vit_base_patch16_224', pretrained=True, in_chans=3) + + # Load checkpoint + # state_dict = torch.load(args.checkpoint)['state_dict'] + checkpoint = torch.load(args.checkpoint) + ckp_state_dict = checkpoint['state_dict'] + model_state_dict = model.state_dict() + + print('checking checkpoint weights...') + len_state_dict = len(ckp_state_dict) + for seq, src_k in enumerate(ckp_state_dict.keys()): + if "module.backbone." 
in src_k: + tgt_k = str(src_k).replace("module.backbone.", "") + if tgt_k not in model_state_dict.keys(): + print(f'{seq+1}/{len_state_dict} Skipped: {src_k}, {ckp_state_dict[src_k].shape}') + + print('loading weights...') + len_state_dict = len(model_state_dict) + for seq, tgt_k in enumerate(model_state_dict.keys()): + src_k = "module.backbone." + str(tgt_k) + if src_k in ckp_state_dict: + model_state_dict[tgt_k] = ckp_state_dict[src_k] + else: + print(f'{seq+1}/{len_state_dict} Skipped: {tgt_k}') + + model.load_state_dict(model_state_dict, strict=False) + + # Save model + print(f'Saving model to {args.save_to}...') + if not os.path.exists(args.save_to): + os.makedirs(args.save_to) + torch.save(model.state_dict(), os.path.join(args.save_to, args.save_name)) + + +def get_args_parser(): + """Input parameters + """ + parser = argparse.ArgumentParser(description='Extract backbone state dict') + parser.add_argument('--checkpoint', default='./checkpoint_0004.pth.tar', type=str, required=True, + help='Path to the checkpoint') + parser.add_argument('--save-to', default='./output', type=str, required=True, + help='Where to save the model') + parser.add_argument('--save-name', default='vit_simclr_16_224.pth', type=str, required=True, + help='Model save name') + parser.add_argument('--num-classes', default=2, type=int, + help='Number of classes to be classified') + parser.add_argument('--random-seed', default=42, type=int, + help='Random seed (enable reproduction)') + parser.add_argument('--basic-weight', default='', type=str, + help='Basic weight (used to init parameters)') + return parser + + +def setup_seed(seed): + """Fix up the random seed + + Args: + seed (int): Seed to be applied + """ + import random + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + random.seed(seed) + torch.backends.cudnn.deterministic = True + + +if __name__ == '__main__': + parser = get_args_parser() + args = parser.parse_args() + + setup_seed(args.random_seed) + main(args) \ No 
newline at end of file diff --git a/PuzzleTuning/Counterpart PreTrain Methods/simclr/models/__pycache__/resnet_simclr.cpython-38.pyc b/PuzzleTuning/Counterpart PreTrain Methods/simclr/models/__pycache__/resnet_simclr.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0f6ba59f4f9d872588a736ce381a9475418e73c3 Binary files /dev/null and b/PuzzleTuning/Counterpart PreTrain Methods/simclr/models/__pycache__/resnet_simclr.cpython-38.pyc differ diff --git a/PuzzleTuning/Counterpart PreTrain Methods/simclr/models/resnet_simclr.py b/PuzzleTuning/Counterpart PreTrain Methods/simclr/models/resnet_simclr.py new file mode 100644 index 0000000000000000000000000000000000000000..08f09ef7771ed5862cf3cd46980c23265aaed97b --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/simclr/models/resnet_simclr.py @@ -0,0 +1,70 @@ +import torch.nn as nn +import torchvision.models as models + +from exceptions.exceptions import InvalidBackboneError + +from timm import create_model +import torch +import logging + + +class ResNetSimCLR(nn.Module): + + def __init__(self, base_model, out_dim): + super(ResNetSimCLR, self).__init__() + self.resnet_dict = {"resnet18": models.resnet18(pretrained=False, num_classes=out_dim), + "resnet50": models.resnet50(pretrained=False, num_classes=out_dim)} + + self.backbone = self._get_basemodel(base_model) + dim_mlp = self.backbone.fc.in_features + + # add mlp projection head + self.backbone.fc = nn.Sequential(nn.Linear(dim_mlp, dim_mlp), nn.ReLU(), self.backbone.fc) + + def _get_basemodel(self, model_name): + try: + model = self.resnet_dict[model_name] + except KeyError: + raise InvalidBackboneError( + "Invalid backbone architecture. 
Check the config file and pass one of: resnet18 or resnet50") + else: + return model + + def forward(self, x): + return self.backbone(x) + + +class ViTSimCLR(nn.Module): + + def __init__(self, base_model, out_dim, load_weight=None): + super(ViTSimCLR, self).__init__() + + # logging.info("=> preparing for backbone model '{}'".format(args.model)) + # backbone_model = create_model('vit_base_patch16_224', pretrained=args.pretrained, in_chans=3) + # if args.model_weights: + # model_weights = torch.load(args.model_weights) + # backbone_model.load_state_dict(model_weights, strict=True) + # logging.info(f"Loaded weights from: {args.model_weights}") + + assert 'vit' in base_model + backbone_model = create_model(base_model, pretrained=True, in_chans=3, num_classes=out_dim) + + # if load_weight: + # model_weights = torch.load(load_weight)['state_dict'] + # updated_weights = {key: value for key, value in model_weights.items() if not key.startswith('head')} + # backbone_model.load_state_dict(updated_weights, strict=False) + # optimizer.load_state_dict(checkpoint['optimizer']) + # lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) + # print(f"Loaded weights from: {load_weight}") + # logging.info(f"Loaded weights from: {load_weight}") + + self.backbone = backbone_model + + dim_mlp = self.backbone.head.in_features + + # add mlp projection head + self.backbone.head = nn.Sequential(nn.Linear(dim_mlp, dim_mlp), nn.ReLU(), self.backbone.head) + + + def forward(self, x): + return self.backbone(x) diff --git a/PuzzleTuning/Counterpart PreTrain Methods/simclr/pretrain.sh b/PuzzleTuning/Counterpart PreTrain Methods/simclr/pretrain.sh new file mode 100644 index 0000000000000000000000000000000000000000..e04017ba87adb04a0978cf35bcafc7d3c265a4e5 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/simclr/pretrain.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# ps -ef | grep simclr | awk '{print $2}' |xargs kill + +# Training settings +pretrain_model="timm" +dataset="All" 
+model_weights="/home/pancreatic-cancer-diagnosis-tansformer/saved_models/ViT_b16_224_Imagenet.pth" + +# Init params +data_path="/root/autodl-tmp/datasets/${dataset}" +model_name="ViT_b16_224_timm_SIMCLR_ALL_100.pth" +checkpoint_path="/root/autodl-tmp/LSQ-simclr/checkpoint/${pretrain_model}" +save_weight_path="/root/autodl-tmp/LSQ-simclr/model_saved/" +tensorboard_path="/root/tf-logs/" + +# Training. Save checkpoint every 20 epochs. +# The checkpoint and backbone model will be available under checkpoint_path folder. +set -e + +python -u run_vit.py \ + --data $data_path \ + --dataset-name "cpia-mini" \ + --output_dir $checkpoint_path \ + --log_dir $tensorboard_path \ + --arch vit_base_patch16_224 \ + --batch_size 512 \ + --epochs 100 \ + --seed 42 \ + --fp16-precision \ + --init_weight_pth $model_weights \ + --enable_notify + +# extract & save model +python -u load_vit_from_ckpt.py \ + --basic-weight ${model_weights} \ + --checkpoint ${checkpoint_path}/checkpoint_0100.pth.tar \ + --save-to $save_weight_path \ + --save-name $model_name \ + --num-classes 2 + +set +e \ No newline at end of file diff --git a/PuzzleTuning/Counterpart PreTrain Methods/simclr/reproduce_env/requirements.txt b/PuzzleTuning/Counterpart PreTrain Methods/simclr/reproduce_env/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..2722afcef58cc96f375503bfb854f7e9b9501915 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/simclr/reproduce_env/requirements.txt @@ -0,0 +1,8 @@ +numpy==1.23.0 +numpy==1.23.5 +PyYAML==6.0 +PyYAML==6.0.1 +timm==0.6.12 +torch==2.0.0 +torchvision==0.15.0 +tqdm==4.65.0 diff --git a/PuzzleTuning/Counterpart PreTrain Methods/simclr/reproduce_env/simmim.yaml b/PuzzleTuning/Counterpart PreTrain Methods/simclr/reproduce_env/simmim.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a2abc17703b80fa839d606bf33a53c7ac1566bb9 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/simclr/reproduce_env/simmim.yaml 
@@ -0,0 +1,220 @@ +name: SimMIM +channels: + - pytorch + - nvidia + - https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/ + - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main/ + - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/ + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - _openmp_mutex=5.1=1_gnu + - asttokens=2.0.5=pyhd3eb1b0_0 + - backcall=0.2.0=pyhd3eb1b0_0 + - blas=1.0=mkl + - brotlipy=0.7.0=py39h27cfd23_1003 + - bzip2=1.0.8=h7b6447c_0 + - ca-certificates=2023.01.10=h06a4308_0 + - certifi=2022.12.7=py39h06a4308_0 + - cffi=1.15.1=py39h5eee18b_3 + - charset-normalizer=2.0.4=pyhd3eb1b0_0 + - comm=0.1.2=py39h06a4308_0 + - cryptography=39.0.1=py39h9ce1e76_0 + - cuda-cudart=11.7.99=0 + - cuda-cupti=11.7.101=0 + - cuda-libraries=11.7.1=0 + - cuda-nvrtc=11.7.99=0 + - cuda-nvtx=11.7.91=0 + - cuda-runtime=11.7.1=0 + - debugpy=1.5.1=py39h295c915_0 + - decorator=5.1.1=pyhd3eb1b0_0 + - entrypoints=0.4=py39h06a4308_0 + - executing=0.8.3=pyhd3eb1b0_0 + - ffmpeg=4.2.2=h20bf706_0 + - filelock=3.9.0=py39h06a4308_0 + - flit-core=3.8.0=py39h06a4308_0 + - freetype=2.12.1=h4a9f257_0 + - giflib=5.2.1=h5eee18b_3 + - gmp=6.2.1=h295c915_3 + - gmpy2=2.1.2=py39heeb90bb_0 + - gnutls=3.6.15=he1e5248_0 + - idna=3.4=py39h06a4308_0 + - ipykernel=6.19.2=py39hb070fc8_0 + - ipython=8.10.0=py39h06a4308_0 + - jedi=0.18.1=py39h06a4308_1 + - jinja2=3.1.2=py39h06a4308_0 + - jpeg=9e=h5eee18b_1 + - jupyter_client=7.4.9=py39h06a4308_0 + - jupyter_core=5.2.0=py39h06a4308_0 + - lame=3.100=h7b6447c_0 + - lcms2=2.12=h3be6417_0 + - ld_impl_linux-64=2.38=h1181459_1 + - lerc=3.0=h295c915_0 + - libcublas=11.10.3.66=0 + - libcufft=10.7.2.124=h4fbf590_0 + - libcufile=1.6.0.25=0 + - libcurand=10.3.2.56=0 + - libcusolver=11.4.0.1=0 + - libcusparse=11.7.4.91=0 + - libdeflate=1.17=h5eee18b_0 + - libffi=3.4.2=h6a678d5_6 + - libgcc-ng=11.2.0=h1234567_1 + - libgomp=11.2.0=h1234567_1 + - libidn2=2.3.2=h7f8727e_0 + - libnpp=11.7.4.75=0 + - libnvjpeg=11.8.0.2=0 + - 
libopus=1.3.1=h7b6447c_0 + - libpng=1.6.39=h5eee18b_0 + - libsodium=1.0.18=h7b6447c_0 + - libstdcxx-ng=11.2.0=h1234567_1 + - libtasn1=4.16.0=h27cfd23_0 + - libtiff=4.5.0=h6a678d5_2 + - libunistring=0.9.10=h27cfd23_0 + - libvpx=1.7.0=h439df22_0 + - libwebp=1.2.4=h11a3e52_1 + - libwebp-base=1.2.4=h5eee18b_1 + - lz4-c=1.9.4=h6a678d5_0 + - markupsafe=2.1.1=py39h7f8727e_0 + - matplotlib-inline=0.1.6=py39h06a4308_0 + - mkl-service=2.4.0=py39h7f8727e_0 + - mkl_fft=1.3.1=py39hd3c417c_0 + - mkl_random=1.2.2=py39h51133e4_0 + - mpc=1.1.0=h10f8cd9_1 + - mpfr=4.0.2=hb69a4c5_1 + - mpmath=1.2.1=py39h06a4308_0 + - ncurses=6.4=h6a678d5_0 + - nest-asyncio=1.5.6=py39h06a4308_0 + - nettle=3.7.3=hbbd107a_1 + - networkx=2.8.4=py39h06a4308_1 + - numpy-base=1.23.5=py39h31eccc5_0 + - openh264=2.1.1=h4ff587b_0 + - openssl=1.1.1t=h7f8727e_0 + - packaging=23.0=py39h06a4308_0 + - parso=0.8.3=pyhd3eb1b0_0 + - pexpect=4.8.0=pyhd3eb1b0_3 + - pickleshare=0.7.5=pyhd3eb1b0_1003 + - pillow=9.4.0=py39h6a678d5_0 + - pip=23.0.1=py39h06a4308_0 + - platformdirs=2.5.2=py39h06a4308_0 + - prompt-toolkit=3.0.36=py39h06a4308_0 + - psutil=5.9.0=py39h5eee18b_0 + - ptyprocess=0.7.0=pyhd3eb1b0_2 + - pure_eval=0.2.2=pyhd3eb1b0_0 + - pycparser=2.21=pyhd3eb1b0_0 + - pygments=2.11.2=pyhd3eb1b0_0 + - pyopenssl=23.0.0=py39h06a4308_0 + - pysocks=1.7.1=py39h06a4308_0 + - python=3.9.16=h7a1cb2a_2 + - python-dateutil=2.8.2=pyhd3eb1b0_0 + - pytorch=2.0.0=py3.9_cuda11.7_cudnn8.5.0_0 + - pytorch-cuda=11.7=h778d358_3 + - pytorch-mutex=1.0=cuda + - pyzmq=23.2.0=py39h6a678d5_0 + - readline=8.2=h5eee18b_0 + - requests=2.28.1=py39h06a4308_1 + - setuptools=65.6.3=py39h06a4308_0 + - six=1.16.0=pyhd3eb1b0_1 + - sqlite=3.41.1=h5eee18b_0 + - stack_data=0.2.0=pyhd3eb1b0_0 + - sympy=1.11.1=py39h06a4308_0 + - tk=8.6.12=h1ccaba5_0 + - torchaudio=2.0.0=py39_cu117 + - torchtriton=2.0.0=py39 + - torchvision=0.15.0=py39_cu117 + - tornado=6.2=py39h5eee18b_0 + - traitlets=5.7.1=py39h06a4308_0 + - typing_extensions=4.4.0=py39h06a4308_0 + - 
tzdata=2022g=h04d1e81_0 + - urllib3=1.26.14=py39h06a4308_0 + - wcwidth=0.2.5=pyhd3eb1b0_0 + - wheel=0.38.4=py39h06a4308_0 + - x264=1!157.20191217=h7b6447c_0 + - xz=5.2.10=h5eee18b_1 + - zeromq=4.3.4=h2531618_0 + - zlib=1.2.13=h5eee18b_0 + - zstd=1.5.2=ha4553b6_0 + - pip: + - absl-py==1.4.0 + - aiohttp==3.8.4 + - aiosignal==1.3.1 + - appdirs==1.4.4 + - async-timeout==4.0.2 + - attrs==23.1.0 + - cachetools==5.3.0 + - click==8.1.3 + - contourpy==1.0.7 + - cycler==0.11.0 + - cython==0.29.33 + - dnspython==2.3.0 + - docker-pycreds==0.4.0 + - einops==0.6.0 + - eventlet==0.33.3 + - fonttools==4.39.2 + - frozenlist==1.3.3 + - fsspec==2023.4.0 + - gco-wrapper==3.0.8 + - gitdb==4.0.10 + - gitpython==3.1.31 + - google-auth==2.16.2 + - google-auth-oauthlib==0.4.6 + - greenlet==2.0.2 + - grpcio==1.51.3 + - histolab==0.6.0 + - huggingface-hub==0.13.2 + - imageio==2.26.0 + - importlib-metadata==6.1.0 + - importlib-resources==5.12.0 + - intel-openmp==2023.0.0 + - joblib==1.2.0 + - kaggle==1.5.13 + - kiwisolver==1.4.4 + - lightning-utilities==0.8.0 + - markdown==3.4.1 + - matplotlib==3.7.1 + - mkl==2023.0.0 + - multidict==6.0.4 + - notifyemail==1.0.2 + - numpy==1.23.0 + - oauthlib==3.2.2 + - opencv-contrib-python==4.7.0.72 + - opencv-python==4.7.0.72 + - openslide-python==1.2.0 + - pandas==1.5.3 + - pathtools==0.1.2 + - protobuf==4.22.1 + - pyasn1==0.4.8 + - pyasn1-modules==0.2.8 + - pyparsing==3.0.9 + - python-slugify==8.0.1 + - pytorch-lightning==2.0.2 + - pytz==2022.7.1 + - pywavelets==1.4.1 + - pyyaml==6.0 + - requests-oauthlib==1.3.1 + - rsa==4.9 + - scikit-image==0.19.3 + - scikit-learn==1.2.2 + - scipy==1.8.1 + - sentry-sdk==1.17.0 + - setproctitle==1.3.2 + - smmap==5.0.0 + - spams==2.6.5.4 + - staintools==2.1.2 + - tbb==2021.8.0 + - tensorboard==2.12.0 + - tensorboard-data-server==0.7.0 + - tensorboard-plugin-wit==1.8.1 + - termcolor==2.3.0 + - text-unidecode==1.3 + - threadpool==1.3.2 + - threadpoolctl==3.1.0 + - tifffile==2023.3.15 + - timm==0.6.12 + - 
torchmetrics==0.11.4 + - tqdm==4.65.0 + - wandb==0.14.0 + - werkzeug==2.2.3 + - yacs==0.1.8 + - yarl==1.9.2 + - zipp==3.15.0 +prefix: /root/miniconda3/envs/SimMIM diff --git a/PuzzleTuning/Counterpart PreTrain Methods/simclr/requirements.txt b/PuzzleTuning/Counterpart PreTrain Methods/simclr/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4231406fdab2ef741f492265e308aa05aeb3b6f4 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/simclr/requirements.txt @@ -0,0 +1,107 @@ +# This file may be used to create an environment using: +# $ conda create --name --file +# platform: linux-64 +_libgcc_mutex=0.1=main +absl-py=0.9.0=pypi_0 +blas=1.0=mkl +bzip2=1.0.8=h516909a_2 +ca-certificates=2019.11.28=hecc5488_0 +cachetools=4.0.0=pypi_0 +cairo=1.14.12=h80bd089_1005 +certifi=2019.11.28=py37hc8dfbb8_1 +chardet=3.0.4=pypi_0 +cudatoolkit=10.1.243=h6bb024c_0 +ffmpeg=4.0.2=ha0c5888_2 +fontconfig=2.13.1=he4413a7_1000 +freeglut=3.0.0=hf484d3e_1005 +freetype=2.9.1=h8a8886c_1 +gettext=0.19.8.1=hc5be6a0_1002 +glib=2.56.2=had28632_1001 +gmp=6.1.2=hf484d3e_1000 +gnutls=3.5.19=h2a4e5f8_1 +google-auth=1.11.3=pypi_0 +google-auth-oauthlib=0.4.1=pypi_0 +graphite2=1.3.13=hf484d3e_1000 +grpcio=1.27.2=pypi_0 +harfbuzz=1.9.0=he243708_1001 +hdf5=1.10.2=hc401514_3 +icu=58.2=hf484d3e_1000 +idna=2.9=pypi_0 +intel-openmp=2020.0=166 +jasper=2.0.14=h07fcdf6_1 +jpeg=9b=h024ee3a_2 +ld_impl_linux-64=2.33.1=h53a641e_7 +libedit=3.1.20181209=hc058e9b_0 +libffi=3.2.1=hd88cf55_4 +libgcc-ng=9.1.0=hdf63c60_0 +libgfortran=3.0.0=1 +libgfortran-ng=7.3.0=hdf63c60_0 +libglu=9.0.0=hf484d3e_1000 +libiconv=1.15=h516909a_1005 +libopencv=3.4.2=hb342d67_1 +libpng=1.6.37=hbc83047_0 +libstdcxx-ng=9.1.0=hdf63c60_0 +libtiff=4.1.0=h2733197_0 +libuuid=2.32.1=h14c3975_1000 +libxcb=1.13=h14c3975_1002 +libxml2=2.9.9=h13577e0_2 +markdown=3.2.1=pypi_0 +mkl=2020.0=166 +mkl-service=2.3.0=py37he904b0f_0 +mkl_fft=1.0.15=py37ha843d7b_0 +mkl_random=1.1.0=py37hd6b4f25_0 +ncurses=6.2=he6710b0_0 
+nettle=3.3=0 +ninja=1.9.0=py37hfd86e86_0 +numpy=1.18.1=py37h4f9e942_0 +numpy-base=1.18.1=py37hde5b4d6_1 +oauthlib=3.1.0=pypi_0 +olefile=0.46=py37_0 +opencv=3.4.2=py37h6fd60c2_1 +openh264=1.8.0=hdbcaa40_1000 +openssl=1.1.1d=h516909a_0 +pcre=8.44=he1b5a44_0 +pillow=7.0.0=py37hb39fc2d_0 +pip=20.0.2=py37_1 +pixman=0.34.0=h14c3975_1003 +protobuf=3.11.3=pypi_0 +pthread-stubs=0.4=h14c3975_1001 +py-opencv=3.4.2=py37hb342d67_1 +pyasn1=0.4.8=pypi_0 +pyasn1-modules=0.2.8=pypi_0 +python=3.7.6=h0371630_2 +python_abi=3.7=1_cp37m +pytorch=1.4.0=py3.7_cuda10.1.243_cudnn7.6.3_0 +pyyaml=5.3=pypi_0 +readline=7.0=h7b6447c_5 +requests=2.23.0=pypi_0 +requests-oauthlib=1.3.0=pypi_0 +rsa=4.0=pypi_0 +setuptools=46.0.0=py37_0 +six=1.14.0=py37_0 +sqlite=3.31.1=h7b6447c_0 +tensorboard=2.1.1=pypi_0 +tk=8.6.8=hbc83047_0 +torchvision=0.5.0=py37_cu101 +urllib3=1.25.8=pypi_0 +werkzeug=1.0.0=pypi_0 +wheel=0.34.2=py37_0 +x264=1!152.20180806=h14c3975_0 +xorg-fixesproto=5.0=h14c3975_1002 +xorg-inputproto=2.3.2=h14c3975_1002 +xorg-kbproto=1.0.7=h14c3975_1002 +xorg-libice=1.0.10=h516909a_0 +xorg-libsm=1.2.3=h84519dc_1000 +xorg-libx11=1.6.9=h516909a_0 +xorg-libxau=1.0.9=h14c3975_0 +xorg-libxdmcp=1.1.3=h516909a_0 +xorg-libxext=1.3.4=h516909a_0 +xorg-libxfixes=5.0.3=h516909a_1004 +xorg-libxi=1.7.10=h516909a_0 +xorg-libxrender=0.9.10=h516909a_1002 +xorg-renderproto=0.11.1=h14c3975_1002 +xorg-xextproto=7.3.0=h14c3975_1002 +xorg-xproto=7.0.31=h14c3975_1007 +xz=5.2.4=h14c3975_4 +zlib=1.2.11=h7b6447c_3 +zstd=1.3.7=h0b5b093_0 diff --git a/PuzzleTuning/Counterpart PreTrain Methods/simclr/run.py b/PuzzleTuning/Counterpart PreTrain Methods/simclr/run.py new file mode 100644 index 0000000000000000000000000000000000000000..1b4e09e0dde21f4e72c9532ca478d57f9d186d67 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/simclr/run.py @@ -0,0 +1,92 @@ +import argparse +import torch +import torch.backends.cudnn as cudnn +from torchvision import models +from data_aug.contrastive_learning_dataset import 
ContrastiveLearningDataset +from models.resnet_simclr import ResNetSimCLR +from simclr import SimCLR + +model_names = sorted(name for name in models.__dict__ + if name.islower() and not name.startswith("__") + and callable(models.__dict__[name])) + +parser = argparse.ArgumentParser(description='PyTorch SimCLR') +parser.add_argument('--data', metavar='DIR', default='./datasets', + help='path to dataset') +parser.add_argument('--dataset-name', default='stl10', + help='dataset name', choices=['stl10', 'cifar10', 'local']) +parser.add_argument('--mode', default='train', + help='train val test stage', choices=['train', 'val', 'test']) +parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18', + choices=model_names, + help='model architecture: ' + + ' | '.join(model_names) + + ' (default: resnet50)') +parser.add_argument('-j', '--workers', default=12, type=int, metavar='N', + help='number of data loading workers (default: 32)') +parser.add_argument('--epochs', default=200, type=int, metavar='N', + help='number of total epochs to run') +parser.add_argument('-b', '--batch-size', default=256, type=int, + metavar='N', + help='mini-batch size (default: 256), this is the total ' + 'batch size of all GPUs on the current node when ' + 'using Data Parallel or Distributed Data Parallel') +parser.add_argument('-lr', '--learning-rate', default=0.0003, type=float, + metavar='LR', help='initial learning rate', dest='lr') +parser.add_argument('-wd', '--weight-decay', default=1e-4, type=float, + metavar='W', help='weight decay (default: 1e-4)', + dest='weight_decay') +parser.add_argument('--seed', default=None, type=int, + help='seed for initializing training. 
def main():
    """Train SimCLR with a torchvision ResNet backbone, configured by the module-level ``parser``.

    Builds the contrastive dataset and loader, the ``ResNetSimCLR`` model, an Adam optimizer and a
    cosine-annealing scheduler, then hands everything to ``SimCLR`` for training from epoch 0.
    """
    import os  # local import: this script has no module-level ``import os``

    args = parser.parse_args()
    assert args.n_views == 2, "Only two view training is supported. Please use --n-views 2."
    # check if gpu training is available
    if not args.disable_cuda and torch.cuda.is_available():
        args.device = torch.device('cuda')
        cudnn.deterministic = True
        cudnn.benchmark = True
    else:
        args.device = torch.device('cpu')
        args.gpu_index = -1

    # SimCLR.__init__ (simclr.py) reads args.log_dir and args.output_dir, but this script's parser
    # defines neither, so construction used to fail with AttributeError. Supply defaults here.
    if not hasattr(args, 'log_dir'):
        args.log_dir = None  # SummaryWriter(log_dir=None) falls back to its default run directory
    if not hasattr(args, 'output_dir'):
        args.output_dir = 'checkpoints'
    os.makedirs(args.output_dir, exist_ok=True)

    dataset = ContrastiveLearningDataset(args.data)
    train_dataset = dataset.get_dataset(args.dataset_name, args.n_views, mode=args.mode)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=True, drop_last=True)

    model = ResNetSimCLR(base_model=args.arch, out_dim=args.out_dim)

    optimizer = torch.optim.Adam(model.parameters(), args.lr, weight_decay=args.weight_decay)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=len(train_loader), eta_min=0,
                                                           last_epoch=-1)

    # torch.cuda.device is a no-op when gpu_index is negative (the CPU branch above sets it to -1).
    with torch.cuda.device(args.gpu_index):
        simclr = SimCLR(model=model, optimizer=optimizer, scheduler=scheduler, args=args)
        # SimCLR.train takes (start_epoch, train_loader); the old one-argument call raised TypeError.
        simclr.train(0, train_loader)
'--learning-rate', default=0.0003, type=float, + metavar='LR', help='initial learning rate', dest='lr') + parser.add_argument('-wd', '--weight-decay', default=1e-4, type=float, + metavar='W', help='weight decay (default: 1e-4)', + dest='weight_decay') + parser.add_argument('--seed', default=42, type=int, + help='seed for initializing training. ') + parser.add_argument('--disable-cuda', action='store_true', + help='Disable CUDA') + parser.add_argument('--fp16-precision', action='store_true', + help='Whether or not to use 16-bit precision GPU training.') + parser.add_argument('--out_dim', default=128, type=int, + help='feature dimension (default: 128)') + parser.add_argument('--log-every-n-steps', default=100, type=int, + help='Log every n steps') + parser.add_argument('--temperature', default=0.07, type=float, + help='softmax temperature (default: 0.07)') + parser.add_argument('--n-views', default=2, type=int, metavar='N', + help='Number of views for contrastive learning training.') + parser.add_argument('--gpu-index', default=0, type=int, help='Gpu index.') + + # Model related + parser.add_argument('-a', '--arch', type=str, default='vit_base_patch16_224', + help='model architecture.') + parser.add_argument('--load_weight', type=str, help='model weight directory.') + parser.add_argument('--img_size', type=int, default=224, help='image size. 
def _notify_training_start(args):
    """Send the start-of-training e-mail through ``notifyemail``; return True if it was sent.

    Mail settings come from the environment so that no account secret lives in the repository:
    ``SIMCLR_NOTIFY_MAIL_USER`` and ``SIMCLR_NOTIFY_MAIL_PASS`` (sender account),
    ``SIMCLR_NOTIFY_RECEIVERS`` (comma-separated recipients) and, optionally,
    ``SIMCLR_NOTIFY_MAIL_HOST`` (defaults to ``smtp.163.com``).

    NOTE(review): the account password that used to be hard-coded here has been published with
    the source and should be treated as compromised and rotated.
    """
    mail_host = os.environ.get('SIMCLR_NOTIFY_MAIL_HOST', 'smtp.163.com')
    mail_user = os.environ.get('SIMCLR_NOTIFY_MAIL_USER')
    mail_pass = os.environ.get('SIMCLR_NOTIFY_MAIL_PASS')
    receivers = [r.strip() for r in os.environ.get('SIMCLR_NOTIFY_RECEIVERS', '').split(',') if r.strip()]
    if not (mail_user and mail_pass and receivers):
        # Notification is best-effort: warn and keep training rather than abort the run.
        print('--enable_notify was given but SIMCLR_NOTIFY_MAIL_USER / SIMCLR_NOTIFY_MAIL_PASS / '
              'SIMCLR_NOTIFY_RECEIVERS are not all set; skipping e-mail notification.')
        return False

    import notifyemail as notify  # optional dependency, only needed when notification is enabled

    notify.Reboost(mail_host=mail_host, mail_user=mail_user, mail_pass=mail_pass,
                   default_reciving_list=receivers,  # keyword spelled exactly as in the original call
                   log_root_path='log', max_log_cnt=5)
    notify.add_text('SimCLR Training')
    notify.add_text('------')
    for a in str(args).split(','):
        notify.add_text(a)
    notify.add_text('------')
    notify.send_log()
    return True


def main(args):
    """Run SimCLR pre-training of a ViT backbone.

    Args:
        args: namespace produced by ``get_args_parser().parse_args()``. ``args.device`` is set
            here (and ``args.gpu_index`` is forced to -1 on CPU).

    Steps: optional e-mail notification, device selection, dataset/loader construction, model
    creation with optional initial weights (``--init_weight_pth``), optional resume from a
    training checkpoint (``--load_weight``), then ``SimCLR.train``.
    """
    if args.enable_notify:
        _notify_training_start(args)

    assert args.n_views == 2, "Only two view training is supported. Please use --n-views 2."

    # --seed was parsed but never applied; honour it before any weights are initialised.
    if args.seed is not None:
        torch.manual_seed(args.seed)

    # check if gpu training is available
    if not args.disable_cuda and torch.cuda.is_available():
        args.device = torch.device('cuda')
        cudnn.deterministic = True
        cudnn.benchmark = True
    else:
        args.device = torch.device('cpu')
        args.gpu_index = -1

    os.makedirs(args.output_dir, exist_ok=True)

    print("{}".format(args).replace(', ', ',\n'))

    dataset = ContrastiveLearningDataset(args.data)
    train_dataset = dataset.get_dataset(args.dataset_name, args.n_views, mode=args.mode, img_size=args.img_size)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=True, drop_last=True)

    model = ViTSimCLR(base_model=args.arch, out_dim=args.out_dim, load_weight=args.load_weight)

    # Optional initial backbone weights. strict=False because the file need not contain every
    # key of this model. map_location='cpu' lets a GPU-saved file load on any machine;
    # load_state_dict copies the values into the model's own tensors.
    if args.init_weight_pth:
        print(f'Loading weight from {args.init_weight_pth}...')
        init_weight = torch.load(args.init_weight_pth, map_location='cpu')
        model.load_state_dict(init_weight, strict=False)
        print('Weight loaded.')

    model = model.to(args.device)
    model = torch.nn.DataParallel(model)

    optimizer = torch.optim.Adam(model.parameters(), args.lr, weight_decay=args.weight_decay)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=args.epochs, eta_min=0,
                                                           last_epoch=-1)

    start_epoch = 0
    if args.load_weight:  # resume from a training checkpoint
        checkpoint = torch.load(args.load_weight, map_location='cpu')
        model.load_state_dict(checkpoint['state_dict'], strict=True)
        optimizer.load_state_dict(checkpoint['optimizer'])
        # NOTE(review): periodic checkpoints store the index of the epoch just finished, so that
        # epoch is run again on resume; the final checkpoint stores args.epochs instead.
        start_epoch = int(checkpoint['epoch'])
        if 'scheduler' in checkpoint:
            # Restore the real scheduler state. The scheduler is only stepped from epoch 10 on,
            # so its last_epoch is not equal to the training epoch and must not be set by hand.
            scheduler.load_state_dict(checkpoint['scheduler'])
        else:
            # Checkpoint without scheduler state: keep the previous approximation.
            scheduler.last_epoch = start_epoch
        print(f"Loaded weights from: {args.load_weight}, starting epoch: {start_epoch}")

    # torch.cuda.device is a no-op when gpu_index is negative (the CPU branch above sets it to -1).
    with torch.cuda.device(args.gpu_index):
        simclr = SimCLR(model=model, optimizer=optimizer, scheduler=scheduler, args=args)
        simclr.train(start_epoch, train_loader)
class SimCLR(object):
    """SimCLR contrastive pre-training: InfoNCE loss, AMP training loop, logging and checkpoints.

    Constructed with keyword arguments only: ``args``, ``model``, ``optimizer`` and ``scheduler``.
    ``args`` must provide ``device``, ``log_dir``, ``output_dir``, ``n_views``, ``temperature``,
    ``fp16_precision``, ``epochs``, ``log_every_n_steps`` and ``arch``.
    """

    def __init__(self, *args, **kwargs):
        """Store the training objects, open the TensorBoard writer and configure file logging."""
        self.args = kwargs['args']
        self.model = kwargs['model'].to(self.args.device)
        self.optimizer = kwargs['optimizer']
        self.scheduler = kwargs['scheduler']
        self.writer = SummaryWriter(log_dir=self.args.log_dir)
        self.output_dir = self.args.output_dir
        # The text log lives next to the TensorBoard event files.
        logging.basicConfig(filename=os.path.join(self.writer.log_dir, 'training.log'), level=logging.DEBUG)
        self.criterion = torch.nn.CrossEntropyLoss().to(self.args.device)

    def info_nce_loss(self, features):
        """Build InfoNCE logits and targets for a batch of view embeddings.

        Args:
            features: tensor of shape ``[n_views * B, D]`` in which the views are concatenated
                along dim 0 (all first views, then all second views, ...), as ``train`` does.

        Returns:
            ``(logits, labels)``: ``logits`` has the positive similarity in column 0 followed by
            the negatives, divided by ``args.temperature``; ``labels`` is an all-zero long tensor,
            so cross-entropy selects column 0.

        Raises:
            ValueError: if the number of rows is not a multiple of ``args.n_views``.
        """
        n_views = self.args.n_views
        if features.shape[0] % n_views != 0:
            raise ValueError(
                f"features has {features.shape[0]} rows, which is not a multiple of n_views={n_views}")
        # Take the batch size from the tensor itself instead of args.batch_size, so the loss also
        # works when the batch actually fed in differs from the configured size.
        batch_size = features.shape[0] // n_views

        # sample ids: [n_views * B]; two rows are a positive pair iff their ids match
        labels = torch.cat([torch.arange(batch_size) for _ in range(n_views)], dim=0)
        labels = (labels.unsqueeze(0) == labels.unsqueeze(1)).float()
        labels = labels.to(self.args.device)

        # cosine similarity between every pair of rows
        features = F.normalize(features, dim=1)
        similarity_matrix = torch.matmul(features, features.T)

        # discard the main diagonal (self-similarity) from both matrices: [N, N] -> [N, N - 1]
        mask = torch.eye(labels.shape[0], dtype=torch.bool).to(self.args.device)
        labels = labels[~mask].view(labels.shape[0], -1)
        similarity_matrix = similarity_matrix[~mask].view(similarity_matrix.shape[0], -1)

        # positives: [N, n_views - 1]; negatives: everything else
        positives = similarity_matrix[labels.bool()].view(labels.shape[0], -1)
        negatives = similarity_matrix[~labels.bool()].view(similarity_matrix.shape[0], -1)

        # positives first, so the target class index is always 0
        logits = torch.cat([positives, negatives], dim=1)
        labels = torch.zeros(logits.shape[0], dtype=torch.long).to(self.args.device)

        logits = logits / self.args.temperature
        return logits, labels

    def _save_checkpoint(self, epoch):
        """Write model/optimizer/scheduler state for ``epoch`` into ``self.output_dir``."""
        checkpoint_path = os.path.join(self.output_dir, 'checkpoint_{:04d}.pth.tar'.format(epoch))
        save_checkpoint({
            'epoch': epoch,
            'arch': self.args.arch,
            'scheduler': self.scheduler.state_dict(),
            'state_dict': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
        }, is_best=False, filename=checkpoint_path)
        # Report the real location; the message used to name the TensorBoard directory.
        logging.info(f"Model checkpoint and metadata has been saved at {checkpoint_path}.")

    def train(self, start_epoch, train_loader):
        """Train from ``start_epoch`` up to ``args.epochs``.

        Args:
            start_epoch: index of the first epoch to run (0 for a fresh run).
            train_loader: iterable of ``(images, _)`` where ``images`` is a sequence of view
                batches that are concatenated along dim 0 before the forward pass.

        A checkpoint is written every 20 epochs (except epoch 0) and once more after the last
        epoch, named after ``args.epochs``.
        """
        scaler = GradScaler(enabled=self.args.fp16_precision)

        # save config file
        save_config_file(self.writer.log_dir, self.args)

        n_iter = 0
        logging.info(f"Start SimCLR training for {self.args.epochs} epochs.")
        logging.info(f"Training with: {self.args.device}.")

        for epoch_counter in range(start_epoch, self.args.epochs):

            time_start = time.time()
            n_batch = 0
            for images, _ in tqdm(train_loader, desc=f'Epoch {epoch_counter}'):
                images = torch.cat(images, dim=0)
                images = images.to(self.args.device)

                with autocast(enabled=self.args.fp16_precision):
                    features = self.model(images)
                    logits, labels = self.info_nce_loss(features)
                    loss = self.criterion(logits, labels)

                self.optimizer.zero_grad()
                scaler.scale(loss).backward()
                scaler.step(self.optimizer)
                scaler.update()

                # Learning rate actually in use. scheduler.get_lr() is not meant to be called
                # outside scheduler.step() and does not reliably report the current value.
                current_lr = self.optimizer.param_groups[0]['lr']

                # computed every step because the epoch-end debug line below reports top1
                top1, top5 = accuracy(logits, labels, topk=(1, 5))
                if n_iter % self.args.log_every_n_steps == 0 and n_iter != 0:
                    self.writer.add_scalar('loss', loss, global_step=n_iter)
                    self.writer.add_scalar('acc/top1', top1[0], global_step=n_iter)
                    self.writer.add_scalar('acc/top5', top5[0], global_step=n_iter)
                    self.writer.add_scalar('learning_rate', current_lr, global_step=n_iter)

                if n_batch % self.args.log_every_n_steps == 0:
                    # Show training status
                    current_stat = 'lr: {:.7f}\t| epoch: {}\t| batch: {:.0f}/{}\t| loss: {:.3f}\t| time: {}'.format(
                        current_lr,
                        epoch_counter,
                        n_batch,
                        len(train_loader)-1,
                        loss.item(),
                        time_to_str((time.time() - time_start), 'sec')
                    )
                    logging.info(current_stat)
                n_iter += 1
                n_batch += 1

            # the learning rate is held at its initial value for the first 10 epochs;
            # cosine annealing only starts afterwards
            if epoch_counter >= 10:
                self.scheduler.step()
            logging.debug(f"Epoch: {epoch_counter}\tLoss: {loss}\tTop1 accuracy: {top1[0]}")

            # Save result every 20 epochs
            if epoch_counter % 20 == 0 and epoch_counter != 0:
                self._save_checkpoint(epoch_counter)

        logging.info("Training has finished.")
        # final checkpoint, named after the configured number of epochs
        self._save_checkpoint(self.args.epochs)
        # make sure buffered TensorBoard events reach disk
        self.writer.flush()
def accuracy(output, target, topk=(1,)):
    """Compute top-k accuracy (in percent) for each k in ``topk``.

    Args:
        output: score tensor of shape ``[batch, num_classes]``.
        target: tensor of shape ``[batch]`` holding the correct class index per row.
        topk: iterable of k values.

    Returns:
        A list with one single-element float tensor per k, in the order of ``topk``.

    A k larger than the number of classes is treated as "all classes" (100 % accuracy) instead
    of making ``Tensor.topk`` raise; this matters for callers that always request top-5 even
    when a small batch yields fewer than five logit columns.
    """
    with torch.no_grad():
        # Tensor.topk cannot return more entries than there are columns.
        maxk = min(max(topk), output.size(1))
        batch_size = target.size(0)

        # pred: [maxk, batch], row r holds every sample's (r + 1)-th best class
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        scale = 100.0 / batch_size
        # slicing past the last row is harmless, so k > maxk simply counts every row
        return [correct[:k].reshape(-1).float().sum(0, keepdim=True).mul_(scale) for k in topk]
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE diff --git a/PuzzleTuning/Counterpart PreTrain Methods/simmim/README.md b/PuzzleTuning/Counterpart PreTrain Methods/simmim/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9fdf785b92d5e8b43f967912a3a5e0a69ef69a73 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/simmim/README.md @@ -0,0 +1,14 @@ +# SimMIM + +The original repo of SimMIM could be found [here](https://github.com/microsoft/SimMIM) + +To install environments: +```bash +pip install -r requirements.txt +``` + +To start pretraining: +```bash +# You need to alter the script according to your directories +bash pretrain.sh +``` diff --git a/PuzzleTuning/Counterpart PreTrain Methods/simmim/README_origin.md b/PuzzleTuning/Counterpart PreTrain Methods/simmim/README_origin.md new file mode 100644 index 0000000000000000000000000000000000000000..ea9a2c111e621df80505c920b945dd1fc5137c4f --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain 
Methods/simmim/README_origin.md @@ -0,0 +1,156 @@ +# SimMIM + +By [Zhenda Xie](https://zdaxie.github.io)\*, [Zheng Zhang](https://stupidzz.github.io/)\*, [Yue Cao](http://yue-cao.me)\*, [Yutong Lin](https://github.com/impiga), [Jianmin Bao](https://jianminbao.github.io/), [Zhuliang Yao](https://github.com/Howal), [Qi Dai](https://www.microsoft.com/en-us/research/people/qid/) and [Han Hu](https://ancientmooner.github.io/)\*. + +This repo is the official implementation of ["SimMIM: A Simple Framework for Masked Image Modeling"](https://arxiv.org/abs/2111.09886). + +## Updates + +***09/29/2022*** + +SimMIM was merged to [Swin Transformer repo on GitHub](https://github.com/microsoft/Swin-Transformer). + +***03/02/2022*** + +SimMIM got accepted by CVPR 2022. SimMIM was used in ["Swin Transformer V2"](https://github.com/microsoft/Swin-Transformer) to alleviate the data hungry problem for large-scale vision model training. + +***12/09/2021*** + +Initial commits: + +1. Pre-trained and fine-tuned models on ImageNet-1K (`Swin Base`, `Swin Large`, and `ViT Base`) are provided. +2. The supported code for ImageNet-1K pre-training and fine-tuneing is provided. + +## Introduction + +**SimMIM** is initially described in [arxiv](https://arxiv.org/abs/2111.09886), which serves as a +simple framework for masked image modeling. From systematically study, we find that simple designs of each component have revealed very strong representation learning performance: 1) random masking of the input image with a moderately large masked patch size (e.g., 32) makes a strong pre-text task; 2) predicting raw pixels of RGB values by direct regression performs no worse than the patch classification approaches with complex designs; 3) the prediction head can be as light as a linear layer, with no worse performance than heavier ones. + +
+ +
+ +## Main Results on ImageNet + +### Swin Transformer + +**ImageNet-1K Pre-trained and Fine-tuned Models** + +| name | pre-train epochs | pre-train resolution | fine-tune resolution | acc@1 | pre-trained model | fine-tuned model | +| :---: | :---: | :---: | :---: | :---: | :---: | :---: | +| Swin-Base | 100 | 192x192 | 192x192 | 82.8 | [google](https://drive.google.com/file/d/1Wcbr66JL26FF30Kip9fZa_0lXrDAKP-d/view?usp=sharing)/[config](configs/swin_base__100ep/simmim_pretrain__swin_base__img192_window6__100ep.yaml) | [google](https://drive.google.com/file/d/1RsgHfjB4B1ZYblXEQVT-FPX3WSvBrxcs/view?usp=sharing)/[config](configs/swin_base__100ep/simmim_finetune__swin_base__img192_window6__100ep.yaml) | +| Swin-Base | 100 | 192x192 | 224x224 | 83.5 | [google](https://drive.google.com/file/d/1Wcbr66JL26FF30Kip9fZa_0lXrDAKP-d/view?usp=sharing)/[config](configs/swin_base__100ep/simmim_pretrain__swin_base__img192_window6__100ep.yaml) | [google](https://drive.google.com/file/d/1mb43BkW56F5smwiX-g7QUUD7f1Rftq8u/view?usp=sharing)/[config](configs/swin_base__100ep/simmim_finetune__swin_base__img224_window7__100ep.yaml) | +| Swin-Base | 800 | 192x192 | 224x224 | 84.0 | [google](https://drive.google.com/file/d/15zENvGjHlM71uKQ3d2FbljWPubtrPtjl/view?usp=sharing)/[config](configs/swin_base__800ep/simmim_pretrain__swin_base__img192_window6__800ep.yaml) | [google](https://drive.google.com/file/d/1xEKyfMTsdh6TfnYhk5vbw0Yz7a-viZ0w/view?usp=sharing)/[config](configs/swin_base__800ep/simmim_finetune__swin_base__img224_window7__800ep.yaml) | +| Swin-Large | 800 | 192x192 | 224x224 | 85.4 | [google](https://drive.google.com/file/d/1qDxrTl2YUDB0505_4QrU5LU2R1kKmcBP/view?usp=sharing)/[config](configs/swin_large__800ep/simmim_pretrain__swin_large__img192_window12__800ep.yaml) | [google](https://drive.google.com/file/d/1mf0ZpXttEvFsH87Www4oQ-t8Kwr0x485/view?usp=sharing)/[config](configs/swin_large__800ep/simmim_finetune__swin_large__img224_window14__800ep.yaml) | +| SwinV2-Huge | 800 | 
192x192 | 224x224 | 85.7 | / | / | +| SwinV2-Huge | 800 | 192x192 | 512x512 | 87.1 | / | / | + +### Vision Transformer + +**ImageNet-1K Pre-trained and Fine-tuned Models** + +| name | pre-train epochs | pre-train resolution | fine-tune resolution | acc@1 | pre-trained model | fine-tuned model | +| :---: | :---: | :---: | :---: | :---: | :---: | :---: | +| ViT-Base | 800 | 224x224 | 224x224 | 83.8 | [google](https://drive.google.com/file/d/1dJn6GYkwMIcoP3zqOEyW1_iQfpBi8UOw/view?usp=sharing)/[config](configs/vit_base__800ep/simmim_pretrain__vit_base__img224__800ep.yaml) | [google](https://drive.google.com/file/d/1fKgDYd0tRgyHyTnyB1CleYxjo0Gn5tEB/view?usp=sharing)/[config](configs/vit_base__800ep/simmim_finetune__vit_base__img224__800ep.yaml) | + +## Citing SimMIM + +``` +@inproceedings{xie2021simmim, + title={SimMIM: A Simple Framework for Masked Image Modeling}, + author={Xie, Zhenda and Zhang, Zheng and Cao, Yue and Lin, Yutong and Bao, Jianmin and Yao, Zhuliang and Dai, Qi and Hu, Han}, + booktitle={International Conference on Computer Vision and Pattern Recognition (CVPR)}, + year={2022} +} +``` + +## Getting Started + +### Installation + +- Install `CUDA 11.3` with `cuDNN 8` following the official installation guide of [CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) and [cuDNN](https://developer.nvidia.com/rdp/cudnn-archive). + +- Setup conda environment: +```bash +# Create environment +conda create -n SimMIM python=3.8 -y +conda activate SimMIM + +# Install requirements +conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch -y + +# Install apex +git clone https://github.com/NVIDIA/apex +cd apex +pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ +cd .. 
+ +# Clone SimMIM +git clone https://github.com/microsoft/SimMIM +cd SimMIM + +# Install other requirements +pip install -r requirements.txt +``` + +### Evaluating provided models + +To evaluate a provided model on ImageNet validation set, run: +```bash +python -m torch.distributed.launch --nproc_per_node <NUM_OF_GPUS> main_finetune.py \ +--eval --cfg <CONFIG_PATH> --resume <CHECKPOINT_PATH> --data-path <IMAGENET_PATH> +``` + +For example, to evaluate the `Swin Base` model on a single GPU, run: +```bash +python -m torch.distributed.launch --nproc_per_node 1 main_finetune.py \ +--eval --cfg configs/swin_base__800ep/simmim_finetune__swin_base__img224_window7__800ep.yaml --resume simmim_finetune__swin_base__img224_window7__800ep.pth --data-path <IMAGENET_PATH> +``` + +### Pre-training with SimMIM +To pre-train models with `SimMIM`, run: +```bash +python -m torch.distributed.launch --nproc_per_node <NUM_OF_GPUS> main_simmim.py \ +--cfg <CONFIG_PATH> --data-path <IMAGENET_PATH>/train [--batch-size <BATCH_SIZE_PER_GPU> --output <OUTPUT_DIR> --tag <JOB_TAG>] +``` + +For example, to pre-train `Swin Base` for 800 epochs on one DGX-2 server, run: +```bash +python -m torch.distributed.launch --nproc_per_node 16 main_simmim.py \ +--cfg configs/swin_base__800ep/simmim_pretrain__swin_base__img192_window6__800ep.yaml --batch-size 128 --data-path <IMAGENET_PATH>/train [--output <OUTPUT_DIR> --tag <JOB_TAG>] +``` + +### Fine-tuning pre-trained models +To fine-tune models pre-trained by `SimMIM`, run: +```bash +python -m torch.distributed.launch --nproc_per_node <NUM_OF_GPUS> main_finetune.py \ +--cfg <CONFIG_PATH> --data-path <IMAGENET_PATH> --pretrained <PRETRAINED_CKPT> [--batch-size <BATCH_SIZE_PER_GPU> --output <OUTPUT_DIR> --tag <JOB_TAG>] +``` + +For example, to fine-tune `Swin Base` pre-trained by `SimMIM` on one DGX-2 server, run: +```bash +python -m torch.distributed.launch --nproc_per_node 16 main_finetune.py \ +--cfg configs/swin_base__800ep/simmim_finetune__swin_base__img224_window7__800ep.yaml --batch-size 128 --data-path <IMAGENET_PATH> --pretrained <PRETRAINED_CKPT> [--output <OUTPUT_DIR> --tag <JOB_TAG>] +``` + +## Contributing + +This project welcomes contributions and suggestions. 
Most contributions require you to agree to a +Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us +the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. + +When you submit a pull request, a CLA bot will automatically determine whether you need to provide +a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions +provided by the bot. You will only need to do this once across all repos using our CLA. + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). +For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or +contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. + +## Trademarks + +This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft +trademarks or logos is subject to and must follow +[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). +Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. +Any use of third-party trademarks or logos are subject to those third-party's policies. 
diff --git a/PuzzleTuning/Counterpart PreTrain Methods/simmim/SECURITY.md b/PuzzleTuning/Counterpart PreTrain Methods/simmim/SECURITY.md new file mode 100644 index 0000000000000000000000000000000000000000..f7b89984f0fb5dd204028bc525e19eefc0859f4f --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/simmim/SECURITY.md @@ -0,0 +1,41 @@ + + +## Security + +Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). + +If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. + +## Reporting Security Issues + +**Please do not report security vulnerabilities through public GitHub issues.** + +Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). + +If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). + +You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 
+ +Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: + + * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) + * Full paths of source file(s) related to the manifestation of the issue + * The location of the affected source code (tag/branch/commit or direct URL) + * Any special configuration required to reproduce the issue + * Step-by-step instructions to reproduce the issue + * Proof-of-concept or exploit code (if possible) + * Impact of the issue, including how an attacker might exploit the issue + +This information will help us triage your report more quickly. + +If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. + +## Preferred Languages + +We prefer all communications to be in English. + +## Policy + +Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). + + \ No newline at end of file diff --git a/PuzzleTuning/Counterpart PreTrain Methods/simmim/SUPPORT.md b/PuzzleTuning/Counterpart PreTrain Methods/simmim/SUPPORT.md new file mode 100644 index 0000000000000000000000000000000000000000..dc72f0e5a0bc2807bf3df31dbc7455e6991b127a --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/simmim/SUPPORT.md @@ -0,0 +1,25 @@ +# TODO: The maintainer of this repo has not yet edited this file + +**REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? + +- **No CSS support:** Fill out this template with information about how to file issues and get help. +- **Yes CSS support:** Fill out an intake form at [aka.ms/spot](https://aka.ms/spot). CSS will work with/help you to determine next steps. 
More details also available at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). +- **Not sure?** Fill out a SPOT intake as though the answer were "Yes". CSS will help you decide. + +*Then remove this first heading from this SUPPORT.MD file before publishing your repo.* + +# Support + +## How to file issues and get help + +This project uses GitHub Issues to track bugs and feature requests. Please search the existing +issues before filing new issues to avoid duplicates. For new issues, file your bug or +feature request as a new Issue. + +For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE +FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER +CHANNEL. WHERE WILL YOU HELP PEOPLE?**. + +## Microsoft Support Policy + +Support for this **PROJECT or PRODUCT** is limited to the resources listed above. diff --git a/PuzzleTuning/Counterpart PreTrain Methods/simmim/config.py b/PuzzleTuning/Counterpart PreTrain Methods/simmim/config.py new file mode 100644 index 0000000000000000000000000000000000000000..a8c7735b75b46c36db276e5addeccd875dedb610 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/simmim/config.py @@ -0,0 +1,264 @@ +# -------------------------------------------------------- +# SimMIM +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ze Liu +# Modified by Zhenda Xie +# -------------------------------------------------------- + +import os +import yaml +from yacs.config import CfgNode as CN + +_C = CN() + +# Base config files +_C.BASE = [''] + +# ----------------------------------------------------------------------------- +# Data settings +# ----------------------------------------------------------------------------- +_C.DATA = CN() +# Batch size for a single GPU, could be overwritten by command line argument +_C.DATA.BATCH_SIZE = 128 +# Path to dataset, could be overwritten by command line argument 
+_C.DATA.DATA_PATH = '' +# Dataset name +_C.DATA.DATASET = 'imagenet' +# Input image size +_C.DATA.IMG_SIZE = 224 +# Interpolation to resize image (random, bilinear, bicubic) +_C.DATA.INTERPOLATION = 'bicubic' +# Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU. +_C.DATA.PIN_MEMORY = True +# Number of data loading threads +_C.DATA.NUM_WORKERS = 8 + +# [SimMIM] Mask patch size for MaskGenerator +_C.DATA.MASK_PATCH_SIZE = 32 +# [SimMIM] Mask ratio for MaskGenerator +_C.DATA.MASK_RATIO = 0.6 + +# ----------------------------------------------------------------------------- +# Model settings +# ----------------------------------------------------------------------------- +_C.MODEL = CN() +# Model type +_C.MODEL.TYPE = 'swin' +# Model name +_C.MODEL.NAME = 'swin_tiny_patch4_window7_224' +# Checkpoint to resume, could be overwritten by command line argument +_C.MODEL.RESUME = '' +# Number of classes, overwritten in data preparation +_C.MODEL.NUM_CLASSES = 1000 +# Dropout rate +_C.MODEL.DROP_RATE = 0.0 +# Drop path rate +_C.MODEL.DROP_PATH_RATE = 0.1 +# Label Smoothing +_C.MODEL.LABEL_SMOOTHING = 0.1 + +# Swin Transformer parameters +_C.MODEL.SWIN = CN() +_C.MODEL.SWIN.PATCH_SIZE = 4 +_C.MODEL.SWIN.IN_CHANS = 3 +_C.MODEL.SWIN.EMBED_DIM = 96 +_C.MODEL.SWIN.DEPTHS = [2, 2, 6, 2] +_C.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24] +_C.MODEL.SWIN.WINDOW_SIZE = 7 +_C.MODEL.SWIN.MLP_RATIO = 4. 
+_C.MODEL.SWIN.QKV_BIAS = True +_C.MODEL.SWIN.QK_SCALE = None +_C.MODEL.SWIN.APE = False +_C.MODEL.SWIN.PATCH_NORM = True + +# Vision Transformer parameters +_C.MODEL.VIT = CN() +_C.MODEL.VIT.PATCH_SIZE = 16 +_C.MODEL.VIT.IN_CHANS = 3 +_C.MODEL.VIT.EMBED_DIM = 768 +_C.MODEL.VIT.DEPTH = 12 +_C.MODEL.VIT.NUM_HEADS = 12 +_C.MODEL.VIT.MLP_RATIO = 4 +_C.MODEL.VIT.QKV_BIAS = True +_C.MODEL.VIT.INIT_VALUES = 0.1 +_C.MODEL.VIT.USE_APE = False +_C.MODEL.VIT.USE_RPB = False +_C.MODEL.VIT.USE_SHARED_RPB = True +_C.MODEL.VIT.USE_MEAN_POOLING = False + +# ----------------------------------------------------------------------------- +# Training settings +# ----------------------------------------------------------------------------- +_C.TRAIN = CN() +_C.TRAIN.START_EPOCH = 0 +_C.TRAIN.EPOCHS = 300 +_C.TRAIN.WARMUP_EPOCHS = 20 +_C.TRAIN.WEIGHT_DECAY = 0.05 +_C.TRAIN.BASE_LR = 5e-4 +_C.TRAIN.WARMUP_LR = 5e-7 +_C.TRAIN.MIN_LR = 5e-6 +# Clip gradient norm +_C.TRAIN.CLIP_GRAD = 5.0 +# Auto resume from latest checkpoint +_C.TRAIN.AUTO_RESUME = True +# Gradient accumulation steps +# could be overwritten by command line argument +_C.TRAIN.ACCUMULATION_STEPS = 0 +# Whether to use gradient checkpointing to save memory +# could be overwritten by command line argument +_C.TRAIN.USE_CHECKPOINT = False + +# LR scheduler +_C.TRAIN.LR_SCHEDULER = CN() +_C.TRAIN.LR_SCHEDULER.NAME = 'cosine' +# Epoch interval to decay LR, used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_EPOCHS = 30 +# LR decay rate, used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_RATE = 0.1 +# Gamma / Multi steps value, used in MultiStepLRScheduler +_C.TRAIN.LR_SCHEDULER.GAMMA = 0.1 +_C.TRAIN.LR_SCHEDULER.MULTISTEPS = [] + +# Optimizer +_C.TRAIN.OPTIMIZER = CN() +_C.TRAIN.OPTIMIZER.NAME = 'adamw' +# Optimizer Epsilon +_C.TRAIN.OPTIMIZER.EPS = 1e-8 +# Optimizer Betas +_C.TRAIN.OPTIMIZER.BETAS = (0.9, 0.999) +# SGD momentum +_C.TRAIN.OPTIMIZER.MOMENTUM = 0.9 + +# [SimMIM] Layer decay for fine-tuning +_C.TRAIN.LAYER_DECAY = 
1.0 + +# ----------------------------------------------------------------------------- +# Augmentation settings +# ----------------------------------------------------------------------------- +_C.AUG = CN() +# Color jitter factor +_C.AUG.COLOR_JITTER = 0.4 +# Use AutoAugment policy. "v0" or "original" +_C.AUG.AUTO_AUGMENT = 'rand-m9-mstd0.5-inc1' +# Random erase prob +_C.AUG.REPROB = 0.25 +# Random erase mode +_C.AUG.REMODE = 'pixel' +# Random erase count +_C.AUG.RECOUNT = 1 +# Mixup alpha, mixup enabled if > 0 +_C.AUG.MIXUP = 0.8 +# Cutmix alpha, cutmix enabled if > 0 +_C.AUG.CUTMIX = 1.0 +# Cutmix min/max ratio, overrides alpha and enables cutmix if set +_C.AUG.CUTMIX_MINMAX = None +# Probability of performing mixup or cutmix when either/both is enabled +_C.AUG.MIXUP_PROB = 1.0 +# Probability of switching to cutmix when both mixup and cutmix enabled +_C.AUG.MIXUP_SWITCH_PROB = 0.5 +# How to apply mixup/cutmix params. Per "batch", "pair", or "elem" +_C.AUG.MIXUP_MODE = 'batch' + +# ----------------------------------------------------------------------------- +# Testing settings +# ----------------------------------------------------------------------------- +_C.TEST = CN() +# Whether to use center crop when testing +_C.TEST.CROP = True + +# ----------------------------------------------------------------------------- +# Misc +# ----------------------------------------------------------------------------- +# Mixed precision opt level, if O0, no amp is used ('O0', 'O1', 'O2') +# overwritten by command line argument +_C.AMP_OPT_LEVEL = '' +# Path to output folder, overwritten by command line argument +_C.OUTPUT = '' +# Tag of experiment, overwritten by command line argument +_C.TAG = 'default' +# Frequency to save checkpoint +_C.SAVE_FREQ = 1 +# Frequency to logging info +_C.PRINT_FREQ = 10 +# Fixed random seed +_C.SEED = 0 +# Perform evaluation only, overwritten by command line argument +_C.EVAL_MODE = False +# Test throughput only, overwritten by command line 
def _update_config_from_file(config, cfg_file):
    """Merge the YAML file ``cfg_file`` into ``config``, base files first.

    The file may list parent configs under a top-level ``BASE`` key. Every
    non-empty entry is resolved relative to the directory of ``cfg_file`` and
    merged recursively before ``cfg_file`` itself, so the file passed in is
    always applied last.

    Args:
        config: config node updated in place; it is defrosted on entry and
            frozen again before returning.
        cfg_file: path of the YAML file to merge.
    """
    config.defrost()
    with open(cfg_file, 'r') as f:
        # Only the BASE list is taken from this parse; the actual values are
        # merged by ``config.merge_from_file`` below. The safe loader is
        # therefore sufficient and cannot construct arbitrary Python objects.
        # An empty file parses to None: treat it as a file without bases.
        yaml_cfg = yaml.safe_load(f) or {}

    # ``BASE: null`` (or a missing key) means there is nothing to inherit.
    for base in yaml_cfg.get('BASE') or []:
        if base:
            _update_config_from_file(
                config, os.path.join(os.path.dirname(cfg_file), base)
            )
    print('=> merge config from {}'.format(cfg_file))
    config.merge_from_file(cfg_file)
    config.freeze()


def update_config(config, args):
    """Apply the config file and command-line overrides from ``args``.

    Order of application: the YAML file ``args.cfg`` (with its bases), then
    the override list ``args.opts`` (handed to ``config.merge_from_list``),
    then the dedicated arguments below. A dedicated argument only overrides
    the config when it exists on ``args`` and is truthy, so falsy values such
    as ``0``, ``''`` or ``None`` count as "not given". Finally
    ``config.OUTPUT`` is extended with the model name and the tag, and the
    config is frozen.

    Args:
        config: config node updated in place.
        args: parsed command-line namespace. ``cfg`` and ``local_rank`` are
            required; every other attribute is optional.
    """
    _update_config_from_file(config, args.cfg)

    config.defrost()
    opts = getattr(args, 'opts', None)
    if opts:
        config.merge_from_list(opts)

    def _check_args(name):
        # Plain attribute lookup instead of eval(): same "present and
        # truthy" test, without ever building code from a string.
        return bool(getattr(args, name, None))

    # merge from specific arguments
    if _check_args('batch_size'):
        config.DATA.BATCH_SIZE = args.batch_size
    if _check_args('data_path'):
        config.DATA.DATA_PATH = args.data_path
    if _check_args('resume'):
        config.MODEL.RESUME = args.resume
    if _check_args('pretrained'):
        config.PRETRAINED = args.pretrained
    if _check_args('accumulation_steps'):
        config.TRAIN.ACCUMULATION_STEPS = args.accumulation_steps
    if _check_args('use_checkpoint'):
        config.TRAIN.USE_CHECKPOINT = True
    if _check_args('amp_opt_level'):
        config.AMP_OPT_LEVEL = args.amp_opt_level
    if _check_args('output'):
        config.OUTPUT = args.output
    if _check_args('tag'):
        config.TAG = args.tag
    if _check_args('eval'):
        config.EVAL_MODE = True
    if _check_args('throughput'):
        config.THROUGHPUT_MODE = True

    # set local rank for distributed training
    config.LOCAL_RANK = args.local_rank

    # output folder: <output>/<model name>/<tag>
    config.OUTPUT = os.path.join(config.OUTPUT, config.MODEL.NAME, config.TAG)

    config.freeze()


def get_config(args):
    """Return a config built from the defaults and updated from ``args``.

    The module-level defaults ``_C`` are cloned first, so the overrides
    applied by ``update_config`` never alter the defaults themselves.
    """
    config = _C.clone()
    update_config(config, args)

    return config
+SAVE_FREQ: 5 +TAG: simmim_finetune__swin_base__img224_window7__100ep \ No newline at end of file diff --git a/PuzzleTuning/Counterpart PreTrain Methods/simmim/configs/swin_base__100ep/simmim_pretrain__swin_base__img192_window6__100ep.yaml b/PuzzleTuning/Counterpart PreTrain Methods/simmim/configs/swin_base__100ep/simmim_pretrain__swin_base__img192_window6__100ep.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b2ea0167781b6ecbb48028eba095a3f926b6123e --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/simmim/configs/swin_base__100ep/simmim_pretrain__swin_base__img192_window6__100ep.yaml @@ -0,0 +1,23 @@ +MODEL: + TYPE: swin + NAME: simmim_pretrain + DROP_PATH_RATE: 0.0 + SWIN: + EMBED_DIM: 128 + DEPTHS: [ 2, 2, 18, 2 ] + NUM_HEADS: [ 4, 8, 16, 32 ] + WINDOW_SIZE: 6 +DATA: + IMG_SIZE: 192 + MASK_PATCH_SIZE: 32 + MASK_RATIO: 0.6 +TRAIN: + EPOCHS: 100 + WARMUP_EPOCHS: 10 + BASE_LR: 2e-4 + WARMUP_LR: 1e-6 + MIN_LR: 1e-5 + WEIGHT_DECAY: 0.05 +PRINT_FREQ: 100 +SAVE_FREQ: 5 +TAG: simmim_pretrain__swin_base__img192_window6__100ep \ No newline at end of file diff --git a/PuzzleTuning/Counterpart PreTrain Methods/simmim/configs/swin_base__800ep/simmim_finetune__swin_base__img224_window7__800ep.yaml b/PuzzleTuning/Counterpart PreTrain Methods/simmim/configs/swin_base__800ep/simmim_finetune__swin_base__img224_window7__800ep.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b13906708121f455586a6bddc5a259db4d82a894 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/simmim/configs/swin_base__800ep/simmim_finetune__swin_base__img224_window7__800ep.yaml @@ -0,0 +1,22 @@ +MODEL: + TYPE: swin + NAME: simmim_finetune + DROP_PATH_RATE: 0.1 + SWIN: + EMBED_DIM: 128 + DEPTHS: [ 2, 2, 18, 2 ] + NUM_HEADS: [ 4, 8, 16, 32 ] + WINDOW_SIZE: 7 +DATA: + IMG_SIZE: 224 +TRAIN: + EPOCHS: 100 + WARMUP_EPOCHS: 20 + BASE_LR: 1.25e-3 + WARMUP_LR: 2.5e-7 + MIN_LR: 2.5e-7 + WEIGHT_DECAY: 0.05 + LAYER_DECAY: 0.8 +PRINT_FREQ: 100 
+SAVE_FREQ: 5 +TAG: simmim_finetune__swin_base__img224_window7__800ep \ No newline at end of file diff --git a/PuzzleTuning/Counterpart PreTrain Methods/simmim/configs/swin_base__800ep/simmim_pretrain__swin_base__img192_window6__800ep.yaml b/PuzzleTuning/Counterpart PreTrain Methods/simmim/configs/swin_base__800ep/simmim_pretrain__swin_base__img192_window6__800ep.yaml new file mode 100644 index 0000000000000000000000000000000000000000..92da90aba803c0c268dc4cd896dac2c7c4b01dee --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/simmim/configs/swin_base__800ep/simmim_pretrain__swin_base__img192_window6__800ep.yaml @@ -0,0 +1,26 @@ +MODEL: + TYPE: swin + NAME: simmim_pretrain + DROP_PATH_RATE: 0.0 + SWIN: + EMBED_DIM: 128 + DEPTHS: [ 2, 2, 18, 2 ] + NUM_HEADS: [ 4, 8, 16, 32 ] + WINDOW_SIZE: 6 +DATA: + IMG_SIZE: 192 + MASK_PATCH_SIZE: 32 + MASK_RATIO: 0.6 +TRAIN: + EPOCHS: 800 + WARMUP_EPOCHS: 10 + BASE_LR: 1e-4 + WARMUP_LR: 5e-7 + WEIGHT_DECAY: 0.05 + LR_SCHEDULER: + NAME: 'multistep' + GAMMA: 0.1 + MULTISTEPS: [700,] +PRINT_FREQ: 100 +SAVE_FREQ: 5 +TAG: simmim_pretrain__swin_base__img192_window6__800ep \ No newline at end of file diff --git a/PuzzleTuning/Counterpart PreTrain Methods/simmim/configs/swin_large__800ep/simmim_finetune__swin_large__img224_window14__800ep.yaml b/PuzzleTuning/Counterpart PreTrain Methods/simmim/configs/swin_large__800ep/simmim_finetune__swin_large__img224_window14__800ep.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3e805413dba928f4c96c90dda9ecacdc639809b3 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/simmim/configs/swin_large__800ep/simmim_finetune__swin_large__img224_window14__800ep.yaml @@ -0,0 +1,22 @@ +MODEL: + TYPE: swin + NAME: simmim_finetune + DROP_PATH_RATE: 0.2 + SWIN: + EMBED_DIM: 192 + DEPTHS: [ 2, 2, 18, 2 ] + NUM_HEADS: [ 6, 12, 24, 48 ] + WINDOW_SIZE: 14 +DATA: + IMG_SIZE: 224 +TRAIN: + EPOCHS: 100 + WARMUP_EPOCHS: 20 + BASE_LR: 1.25e-3 + WARMUP_LR: 2.5e-7 + MIN_LR: 
2.5e-7 + WEIGHT_DECAY: 0.05 + LAYER_DECAY: 0.7 +PRINT_FREQ: 100 +SAVE_FREQ: 5 +TAG: simmim_finetune__swin_large__img224_window14__800ep \ No newline at end of file diff --git a/PuzzleTuning/Counterpart PreTrain Methods/simmim/configs/swin_large__800ep/simmim_pretrain__swin_large__img192_window12__800ep.yaml b/PuzzleTuning/Counterpart PreTrain Methods/simmim/configs/swin_large__800ep/simmim_pretrain__swin_large__img192_window12__800ep.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5494530a91d586e1dd3de4aeb1229724b41966f5 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/simmim/configs/swin_large__800ep/simmim_pretrain__swin_large__img192_window12__800ep.yaml @@ -0,0 +1,26 @@ +MODEL: + TYPE: swin + NAME: simmim_pretrain + DROP_PATH_RATE: 0.0 + SWIN: + EMBED_DIM: 192 + DEPTHS: [ 2, 2, 18, 2 ] + NUM_HEADS: [ 6, 12, 24, 48 ] + WINDOW_SIZE: 12 +DATA: + IMG_SIZE: 192 + MASK_PATCH_SIZE: 32 + MASK_RATIO: 0.6 +TRAIN: + EPOCHS: 800 + WARMUP_EPOCHS: 10 + BASE_LR: 1e-4 + WARMUP_LR: 5e-7 + WEIGHT_DECAY: 0.05 + LR_SCHEDULER: + NAME: 'multistep' + GAMMA: 0.1 + MULTISTEPS: [700,] +PRINT_FREQ: 100 +SAVE_FREQ: 5 +TAG: simmim_pretrain__swin_large__img192_window12__800ep \ No newline at end of file diff --git a/PuzzleTuning/Counterpart PreTrain Methods/simmim/configs/vit_base__800ep/simmim_finetune__vit_base__img224__800ep.yaml b/PuzzleTuning/Counterpart PreTrain Methods/simmim/configs/vit_base__800ep/simmim_finetune__vit_base__img224__800ep.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6fcc5fc965e7ecc375dc10b37b98e665d487296f --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/simmim/configs/vit_base__800ep/simmim_finetune__vit_base__img224__800ep.yaml @@ -0,0 +1,25 @@ +MODEL: + TYPE: vit + NAME: simmim_finetune + DROP_PATH_RATE: 0.1 + VIT: + EMBED_DIM: 768 + DEPTH: 12 + NUM_HEADS: 12 + USE_APE: False + USE_RPB: True + USE_SHARED_RPB: False + USE_MEAN_POOLING: True +DATA: + IMG_SIZE: 224 +TRAIN: + EPOCHS: 100 
+ WARMUP_EPOCHS: 20 + BASE_LR: 1.25e-3 + WARMUP_LR: 2.5e-7 + MIN_LR: 2.5e-7 + WEIGHT_DECAY: 0.05 + LAYER_DECAY: 0.65 +PRINT_FREQ: 100 +SAVE_FREQ: 5 +TAG: simmim_finetune__vit_base__img224__800ep diff --git a/PuzzleTuning/Counterpart PreTrain Methods/simmim/configs/vit_base__800ep/simmim_pretrain__vit_base__img224__800ep.yaml b/PuzzleTuning/Counterpart PreTrain Methods/simmim/configs/vit_base__800ep/simmim_pretrain__vit_base__img224__800ep.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3ac2bbda547430b906793e45c8720f0c2ee7ffa3 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/simmim/configs/vit_base__800ep/simmim_pretrain__vit_base__img224__800ep.yaml @@ -0,0 +1,29 @@ +MODEL: + TYPE: vit + NAME: simmim_pretrain + DROP_PATH_RATE: 0.1 + VIT: + EMBED_DIM: 768 + DEPTH: 12 + NUM_HEADS: 12 + USE_APE: False + USE_RPB: False + USE_SHARED_RPB: True + USE_MEAN_POOLING: False +DATA: + IMG_SIZE: 224 + MASK_PATCH_SIZE: 32 + MASK_RATIO: 0.6 +TRAIN: + EPOCHS: 800 + WARMUP_EPOCHS: 10 + BASE_LR: 1e-4 + WARMUP_LR: 5e-7 + WEIGHT_DECAY: 0.05 + LR_SCHEDULER: + NAME: 'multistep' + GAMMA: 0.1 + MULTISTEPS: [700,] +PRINT_FREQ: 100 +SAVE_FREQ: 5 +TAG: simmim_pretrain__vit_base__img224__800ep diff --git a/PuzzleTuning/Counterpart PreTrain Methods/simmim/configs/vit_base__test/simmim_pretrain__vit_base__img224__100ep.yaml b/PuzzleTuning/Counterpart PreTrain Methods/simmim/configs/vit_base__test/simmim_pretrain__vit_base__img224__100ep.yaml new file mode 100644 index 0000000000000000000000000000000000000000..018ff7981f1b9be23e24975653758043e6ecb793 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/simmim/configs/vit_base__test/simmim_pretrain__vit_base__img224__100ep.yaml @@ -0,0 +1,31 @@ +MODEL: + TYPE: vit + NAME: simmim_pretrain + DROP_PATH_RATE: 0.0 + VIT: + EMBED_DIM: 768 + DEPTH: 12 + NUM_HEADS: 12 + USE_APE: True + USE_RPB: False + USE_SHARED_RPB: True + USE_MEAN_POOLING: False + QKV_BIAS: True + INIT_VALUES: None +DATA: + IMG_SIZE: 224 
# ---------------------------------------------------------------------------
# data/__init__.py
# ---------------------------------------------------------------------------
def build_loader(config, logger, is_pretrain):
    """Build the data pipeline for pre-training or for fine-tuning.

    Returns whatever the selected builder returns: a single dataloader for
    pre-training, or the fine-tuning tuple of datasets, loaders and mixup_fn.
    """
    builder = build_loader_simmim if is_pretrain else build_loader_finetune
    return builder(config, logger)


# ---------------------------------------------------------------------------
# data/data_finetune.py
# ---------------------------------------------------------------------------
def build_loader_finetune(config, logger):
    """Create the distributed train/val datasets and loaders for fine-tuning.

    Returns:
        ``(dataset_train, dataset_val, data_loader_train, data_loader_val,
        mixup_fn)``; ``mixup_fn`` is ``None`` when neither mixup nor cutmix
        is enabled in ``config.AUG``.
    """
    # The class count is written back into the config, so it has to be
    # unfrozen while the training set is built.
    config.defrost()
    train_set, config.MODEL.NUM_CLASSES = build_dataset(is_train=True, config=config, logger=logger)
    config.freeze()
    val_set, _ = build_dataset(is_train=False, config=config, logger=logger)
    logger.info(f"Build dataset: train images = {len(train_set)}, val images = {len(val_set)}")

    world_size = dist.get_world_size()
    rank = dist.get_rank()
    train_sampler = DistributedSampler(train_set, num_replicas=world_size, rank=rank, shuffle=True)
    val_sampler = DistributedSampler(val_set, num_replicas=world_size, rank=rank, shuffle=False)

    # Options shared by both loaders; only drop_last differs.
    loader_options = dict(
        batch_size=config.DATA.BATCH_SIZE,
        num_workers=config.DATA.NUM_WORKERS,
        pin_memory=config.DATA.PIN_MEMORY,
    )
    train_loader = DataLoader(train_set, sampler=train_sampler, drop_last=True, **loader_options)
    val_loader = DataLoader(val_set, sampler=val_sampler, drop_last=False, **loader_options)

    # setup mixup / cutmix
    aug = config.AUG
    mixup_fn = None
    if aug.MIXUP > 0 or aug.CUTMIX > 0. or aug.CUTMIX_MINMAX is not None:
        mixup_fn = Mixup(
            mixup_alpha=aug.MIXUP,
            cutmix_alpha=aug.CUTMIX,
            cutmix_minmax=aug.CUTMIX_MINMAX,
            prob=aug.MIXUP_PROB,
            switch_prob=aug.MIXUP_SWITCH_PROB,
            mode=aug.MIXUP_MODE,
            label_smoothing=config.MODEL.LABEL_SMOOTHING,
            num_classes=config.MODEL.NUM_CLASSES,
        )

    return train_set, val_set, train_loader, val_loader, mixup_fn


def build_dataset(is_train, config, logger):
    """Return ``(dataset, number_of_classes)`` for the requested split.

    Raises:
        NotImplementedError: if ``config.DATA.DATASET`` is not ``'imagenet'``.
    """
    transform = build_transform(is_train, config)
    logger.info(f'Fine-tune data transform, is_train={is_train}:\n{transform}')

    if config.DATA.DATASET != 'imagenet':
        raise NotImplementedError("We only support ImageNet Now.")

    split_dir = 'train' if is_train else 'val'
    split_root = os.path.join(config.DATA.DATA_PATH, split_dir)
    return datasets.ImageFolder(split_root, transform=transform), 1000


def build_transform(is_train, config):
    """Build the training augmentation or the deterministic eval transform."""
    needs_resize = config.DATA.IMG_SIZE > 32

    if is_train:
        # this should always dispatch to transforms_imagenet_train
        jitter = config.AUG.COLOR_JITTER if config.AUG.COLOR_JITTER > 0 else None
        policy = config.AUG.AUTO_AUGMENT if config.AUG.AUTO_AUGMENT != 'none' else None
        train_pipeline = create_transform(
            input_size=config.DATA.IMG_SIZE,
            is_training=True,
            color_jitter=jitter,
            auto_augment=policy,
            re_prob=config.AUG.REPROB,
            re_mode=config.AUG.REMODE,
            re_count=config.AUG.RECOUNT,
            interpolation=config.DATA.INTERPOLATION,
        )
        if not needs_resize:
            # For small images swap the leading random-resized-crop for a
            # padded RandomCrop.
            train_pipeline.transforms[0] = transforms.RandomCrop(config.DATA.IMG_SIZE, padding=4)
        return train_pipeline

    eval_steps = []
    if needs_resize:
        if config.TEST.CROP:
            # Resize to 256/224 of the target size, then centre-crop, to
            # maintain same ratio w.r.t. 224 images.
            enlarged = int((256 / 224) * config.DATA.IMG_SIZE)
            eval_steps.append(
                transforms.Resize(enlarged, interpolation=str_to_pil_interp(config.DATA.INTERPOLATION))
            )
            eval_steps.append(transforms.CenterCrop(config.DATA.IMG_SIZE))
        else:
            eval_steps.append(
                transforms.Resize((config.DATA.IMG_SIZE, config.DATA.IMG_SIZE),
                                  interpolation=str_to_pil_interp(config.DATA.INTERPOLATION))
            )

    eval_steps.append(transforms.ToTensor())
    eval_steps.append(transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD))
    return transforms.Compose(eval_steps)


# ---------------------------------------------------------------------------
# data/data_simmim.py
# ---------------------------------------------------------------------------
class MaskGenerator:
    """Draw a random binary patch mask for masked image modelling.

    A ``rand_size x rand_size`` grid of mask patches is sampled with
    ``ceil(token_count * mask_ratio)`` cells set to 1, then each cell is
    repeated ``scale`` times along both axes so the result has one entry per
    model patch.
    """

    def __init__(self, input_size=192, mask_patch_size=32, model_patch_size=4, mask_ratio=0.6):
        self.input_size = input_size
        self.mask_patch_size = mask_patch_size
        self.model_patch_size = model_patch_size
        self.mask_ratio = mask_ratio

        # The three sizes must nest exactly.
        assert self.input_size % self.mask_patch_size == 0
        assert self.mask_patch_size % self.model_patch_size == 0

        self.rand_size = self.input_size // self.mask_patch_size
        self.scale = self.mask_patch_size // self.model_patch_size

        self.token_count = self.rand_size ** 2
        self.mask_count = int(np.ceil(self.token_count * self.mask_ratio))

    def __call__(self):
        chosen = np.random.permutation(self.token_count)[:self.mask_count]
        flat = np.zeros(self.token_count, dtype=int)
        flat[chosen] = 1

        coarse = flat.reshape((self.rand_size, self.rand_size))
        # Expand every mask patch to scale x scale model patches.
        return np.repeat(np.repeat(coarse, self.scale, axis=0), self.scale, axis=1)


class SimMIMTransform:
    """Image augmentation plus a freshly sampled mask for every sample."""

    def __init__(self, config):
        self.transform_img = T.Compose([
            T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
            T.RandomResizedCrop(config.DATA.IMG_SIZE, scale=(0.67, 1.), ratio=(3. / 4., 4. / 3.)),
            T.RandomHorizontalFlip(),
            T.ToTensor(),
            T.Normalize(mean=torch.tensor(IMAGENET_DEFAULT_MEAN), std=torch.tensor(IMAGENET_DEFAULT_STD)),
        ])

        # The patch size lives under a model-specific config node.
        model_type = config.MODEL.TYPE
        if model_type == 'swin':
            patch = config.MODEL.SWIN.PATCH_SIZE
        elif model_type == 'vit':
            patch = config.MODEL.VIT.PATCH_SIZE
        else:
            raise NotImplementedError

        self.mask_generator = MaskGenerator(
            input_size=config.DATA.IMG_SIZE,
            mask_patch_size=config.DATA.MASK_PATCH_SIZE,
            model_patch_size=patch,
            mask_ratio=config.DATA.MASK_RATIO,
        )

    def __call__(self, img):
        return self.transform_img(img), self.mask_generator()


def collate_fn(batch):
    """Collate ``(sample, label)`` pairs whose sample may be a tuple.

    When the first sample is not a tuple the batch goes straight to
    ``default_collate``. Otherwise each tuple position is collated on its own
    (a position whose first entry is ``None`` yields ``None``) and the
    collated labels are appended as the last element of the returned list.
    """
    first_sample = batch[0][0]
    if not isinstance(first_sample, tuple):
        return default_collate(batch)

    collated = [
        None if first_sample[pos] is None
        else default_collate([entry[0][pos] for entry in batch])
        for pos in range(len(first_sample))
    ]
    collated.append(default_collate([entry[1] for entry in batch]))
    return collated


def build_loader_simmim(config, logger):
    """Create the shuffled, distributed dataloader used for SimMIM pre-training."""
    simmim_transform = SimMIMTransform(config)
    logger.info(f'Pre-train data transform:\n{simmim_transform}')

    image_folder = ImageFolder(config.DATA.DATA_PATH, simmim_transform)
    logger.info(f'Build dataset: train images = {len(image_folder)}')

    shard_sampler = DistributedSampler(
        image_folder,
        num_replicas=dist.get_world_size(),
        rank=dist.get_rank(),
        shuffle=True,
    )
    return DataLoader(
        image_folder,
        config.DATA.BATCH_SIZE,
        sampler=shard_sampler,
        num_workers=config.DATA.NUM_WORKERS,
        pin_memory=True,
        drop_last=True,
        collate_fn=collate_fn,
    )
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving backbone init weight to ./model_base...\n" + ] + } + ], + "source": [ + "from load_vit_from_ckpt import gen_basic_weight\n", + "import os\n", + "\n", + "\n", + "base_weight_pth = './model_base'\n", + "gen_basic_weight(base_weight_pth)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "SimMIM", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "be8d61690b9c27505fb56c69c6c249490f4cb538c6e1d60f116d2e57d82ff881" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/PuzzleTuning/Counterpart PreTrain Methods/simmim/load_vit_from_ckpt.py b/PuzzleTuning/Counterpart PreTrain Methods/simmim/load_vit_from_ckpt.py new file mode 100644 index 0000000000000000000000000000000000000000..6fcb2a1a271b45de62d079bf28da92bb608a427e --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/simmim/load_vit_from_ckpt.py @@ -0,0 +1,122 @@ +""" +Extracting backbone from a specified SimMIM checkpoint. 
def gen_basic_weight(save_dir):
    """Save randomly initialised and ImageNet-pretrained ViT-B/16 state dicts.

    Two files are written into ``save_dir`` (created if missing):
    ``ViT_b16_224_Random_Init.pth`` and ``ViT_b16_224_Imagenet.pth``.
    """
    # Load timm vit weight
    model = create_model('vit_base_patch16_224', pretrained=False, in_chans=3)
    random_state_dict = model.state_dict()

    model = create_model('vit_base_patch16_224', pretrained=True, in_chans=3)
    pretrained_state_dict = model.state_dict()

    # Save model
    print(f'Saving backbone init weight to {save_dir}...')
    os.makedirs(save_dir, exist_ok=True)
    torch.save(random_state_dict, os.path.join(save_dir, 'ViT_b16_224_Random_Init.pth'))
    torch.save(pretrained_state_dict, os.path.join(save_dir, 'ViT_b16_224_Imagenet.pth'))


def main(args):
    """Copy the ``encoder.*`` weights of a SimMIM checkpoint into a timm ViT.

    A ``vit_base_patch16_224`` is initialised from ``args.basic_weight``, every
    parameter that has an ``encoder.<name>`` counterpart under the
    checkpoint's ``'model'`` entry is overwritten with it, and the resulting
    state dict is saved to ``args.save_to / args.save_name``.

    Raises:
        RuntimeError: if ``args.basic_weight`` is empty. The ImageNet fallback
            was already disabled in this script; the error is now explicit.
    """
    prefix = "encoder."

    # Initialize model
    if not args.basic_weight:
        raise RuntimeError(
            '--basic-weight is required: pass the state dict used to initialise '
            'the parameters that the checkpoint does not provide '
            '(see gen_basic_weight).'
        )
    model = create_model('vit_base_patch16_224', pretrained=False, in_chans=3)

    # Load basic weights (default initial parameters). map_location keeps the
    # script usable on machines without the GPU the file was saved from.
    basic_weight = torch.load(args.basic_weight, map_location='cpu')
    model.load_state_dict(basic_weight, strict=False)

    # Load checkpoint
    checkpoint = torch.load(args.checkpoint, map_location='cpu')
    ckp_state_dict = checkpoint['model']
    model_state_dict = model.state_dict()

    # Report encoder weights in the checkpoint that the ViT has no slot for.
    print('checking checkpoint weights...')
    len_state_dict = len(ckp_state_dict)
    for seq, src_k in enumerate(ckp_state_dict):
        src_name = str(src_k)
        if not src_name.startswith(prefix):
            continue
        if src_name[len(prefix):] not in model_state_dict:
            print(f'{seq+1}/{len_state_dict} Skipped: {src_k}, {ckp_state_dict[src_k].shape}')

    # Overwrite every ViT parameter that the checkpoint provides.
    print('loading weights...')
    len_state_dict = len(model_state_dict)
    for seq, tgt_k in enumerate(list(model_state_dict)):
        src_k = prefix + str(tgt_k)
        if src_k in ckp_state_dict:
            model_state_dict[tgt_k] = ckp_state_dict[src_k]
        else:
            print(f'{seq+1}/{len_state_dict} Skipped: {tgt_k}')

    model.load_state_dict(model_state_dict, strict=False)

    # Save model
    print(f'Saving model to {args.save_to}...')
    os.makedirs(args.save_to, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(args.save_to, args.save_name))


def get_args_parser():
    """Build the command-line parser of the extraction script."""
    parser = argparse.ArgumentParser(description='Extract backbone state dict')
    # NOTE: the three options below are required, so their defaults are never used.
    parser.add_argument('--checkpoint', default='./checkpoint_0004.pth.tar', type=str, required=True,
                        help='Path to the checkpoint')
    parser.add_argument('--save-to', default='./output', type=str, required=True,
                        help='Where to save the model')
    parser.add_argument('--save-name', default='vit_simmim_16_224.pth', type=str, required=True,
                        help='Model save name')
    parser.add_argument('--num-classes', default=2, type=int,
                        help='Number of classes to be classified')
    parser.add_argument('--random-seed', default=42, type=int,
                        help='Random seed (enable reproduction)')
    parser.add_argument('--basic-weight', default='', type=str,
                        help='Basic weight (used to init parameters)')
    return parser


def setup_seed(seed):
    """Seed torch (CPU and all CUDA devices) and ``random``; make cuDNN deterministic.

    Args:
        seed (int): Seed to be applied
    """
    import random
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True


if __name__ == '__main__':
    parser = get_args_parser()
    args = parser.parse_args()

    setup_seed(args.random_seed)
    main(args)
@functools.lru_cache()
def create_logger(output_dir, dist_rank=0, name=''):
    """Return the DEBUG-level logger ``name``, configured for this process.

    Rank 0 additionally logs to stdout with a coloured prefix; every rank
    appends to ``<output_dir>/log_rank<dist_rank>.txt``. The result is
    memoised on the arguments by ``lru_cache``, so repeated calls with the
    same arguments return the cached logger instead of attaching more handlers.
    """
    date_format = '%Y-%m-%d %H:%M:%S'
    plain_format = '[%(asctime)s %(name)s] (%(filename)s %(lineno)d): %(levelname)s %(message)s'
    colour_format = (
        colored('[%(asctime)s %(name)s]', 'green')
        + colored('(%(filename)s %(lineno)d)', 'yellow')
        + ': %(levelname)s %(message)s'
    )

    log = logging.getLogger(name)
    log.setLevel(logging.DEBUG)
    # Keep records away from the root logger's handlers.
    log.propagate = False

    def _attach(handler, line_format):
        handler.setLevel(logging.DEBUG)
        handler.setFormatter(logging.Formatter(fmt=line_format, datefmt=date_format))
        log.addHandler(handler)

    # Console output only on the master process.
    if dist_rank == 0:
        _attach(logging.StreamHandler(sys.stdout), colour_format)

    # One append-mode log file per rank.
    log_path = os.path.join(output_dir, f'log_rank{dist_rank}.txt')
    _attach(logging.FileHandler(log_path, mode='a'), plain_format)

    return log
def build_scheduler(config, optimizer, n_iter_per_epoch):
    """Create the per-iteration LR scheduler selected by ``config``.

    All durations in ``config.TRAIN`` are given in epochs and converted to
    optimizer updates with ``n_iter_per_epoch``; every scheduler is built with
    ``t_in_epochs=False``.

    Args:
        config: config node with a ``TRAIN`` section.
        optimizer: optimizer whose ``lr`` is scheduled.
        n_iter_per_epoch: number of updates per epoch.

    Raises:
        NotImplementedError: if ``config.TRAIN.LR_SCHEDULER.NAME`` is not one
            of ``'cosine'``, ``'linear'``, ``'step'`` or ``'multistep'``.
            Previously ``None`` was returned and the failure surfaced later.
    """
    sched_cfg = config.TRAIN.LR_SCHEDULER
    num_steps = int(config.TRAIN.EPOCHS * n_iter_per_epoch)
    warmup_steps = int(config.TRAIN.WARMUP_EPOCHS * n_iter_per_epoch)

    if sched_cfg.NAME == 'cosine':
        # The cycle multiplier is left at its default of 1: the keyword was
        # renamed from ``t_mul`` to ``cycle_mul`` between timm releases, so
        # passing either name breaks on the other.
        return CosineLRScheduler(
            optimizer,
            t_initial=num_steps,
            lr_min=config.TRAIN.MIN_LR,
            warmup_lr_init=config.TRAIN.WARMUP_LR,
            warmup_t=warmup_steps,
            cycle_limit=1,
            t_in_epochs=False,
        )
    if sched_cfg.NAME == 'linear':
        return LinearLRScheduler(
            optimizer,
            t_initial=num_steps,
            lr_min_rate=0.01,
            warmup_lr_init=config.TRAIN.WARMUP_LR,
            warmup_t=warmup_steps,
            t_in_epochs=False,
        )
    if sched_cfg.NAME == 'step':
        # Scheduler-specific options are read only by the branch that uses them.
        decay_steps = int(sched_cfg.DECAY_EPOCHS * n_iter_per_epoch)
        return StepLRScheduler(
            optimizer,
            decay_t=decay_steps,
            decay_rate=sched_cfg.DECAY_RATE,
            warmup_lr_init=config.TRAIN.WARMUP_LR,
            warmup_t=warmup_steps,
            t_in_epochs=False,
        )
    if sched_cfg.NAME == 'multistep':
        multi_steps = [i * n_iter_per_epoch for i in sched_cfg.MULTISTEPS]
        return MultiStepLRScheduler(
            optimizer,
            milestones=multi_steps,
            gamma=sched_cfg.GAMMA,
            warmup_lr_init=config.TRAIN.WARMUP_LR,
            warmup_t=warmup_steps,
            t_in_epochs=False,
        )

    raise NotImplementedError(f"Unsupported LR scheduler: {sched_cfg.NAME!r}")


class LinearLRScheduler(Scheduler):
    """Linear warmup followed by linear decay to ``lr_min_rate * base_lr``.

    During the first ``warmup_t`` steps the LR rises linearly from
    ``warmup_lr_init`` to each group's base value; afterwards it falls
    linearly, reaching ``base * lr_min_rate`` at step ``t_initial``.
    """

    def __init__(self,
                 optimizer: torch.optim.Optimizer,
                 t_initial: int,
                 lr_min_rate: float,
                 warmup_t=0,
                 warmup_lr_init=0.,
                 t_in_epochs=True,
                 noise_range_t=None,
                 noise_pct=0.67,
                 noise_std=1.0,
                 noise_seed=42,
                 initialize=True,
                 ) -> None:
        super().__init__(
            optimizer, param_group_field="lr",
            noise_range_t=noise_range_t, noise_pct=noise_pct, noise_std=noise_std, noise_seed=noise_seed,
            initialize=initialize)

        self.t_initial = t_initial
        self.lr_min_rate = lr_min_rate
        self.warmup_t = warmup_t
        self.warmup_lr_init = warmup_lr_init
        self.t_in_epochs = t_in_epochs
        if self.warmup_t:
            # Per-step LR increment of each parameter group during warmup.
            self.warmup_steps = [(v - warmup_lr_init) / self.warmup_t for v in self.base_values]
            super().update_groups(self.warmup_lr_init)
        else:
            self.warmup_steps = [1 for _ in self.base_values]

    def _get_lr(self, t):
        if t < self.warmup_t:
            lrs = [self.warmup_lr_init + t * s for s in self.warmup_steps]
        else:
            t = t - self.warmup_t
            total_t = self.t_initial - self.warmup_t
            lrs = [v - ((v - v * self.lr_min_rate) * (t / total_t)) for v in self.base_values]
        return lrs

    def get_epoch_values(self, epoch: int):
        if self.t_in_epochs:
            return self._get_lr(epoch)
        else:
            return None

    def get_update_values(self, num_updates: int):
        if not self.t_in_epochs:
            return self._get_lr(num_updates)
        else:
            return None


class MultiStepLRScheduler(Scheduler):
    """Linear warmup followed by a ``gamma`` decay at each milestone.

    After warmup the LR of every group is ``base * gamma ** k`` where ``k`` is
    the number of milestones that are ``<= t``.
    """

    def __init__(self, optimizer: torch.optim.Optimizer, milestones, gamma=0.1, warmup_t=0, warmup_lr_init=0, t_in_epochs=True) -> None:
        super().__init__(optimizer, param_group_field="lr")

        # bisect_right in _get_lr needs ascending milestones.
        self.milestones = sorted(milestones)
        self.gamma = gamma
        self.warmup_t = warmup_t
        self.warmup_lr_init = warmup_lr_init
        self.t_in_epochs = t_in_epochs
        if self.warmup_t:
            # Per-step LR increment of each parameter group during warmup.
            self.warmup_steps = [(v - warmup_lr_init) / self.warmup_t for v in self.base_values]
            super().update_groups(self.warmup_lr_init)
        else:
            self.warmup_steps = [1 for _ in self.base_values]

        # Warmup must finish before the first decay; an empty milestone list
        # (constant LR after warmup) is accepted.
        assert not self.milestones or self.warmup_t <= self.milestones[0]

    def _get_lr(self, t):
        if t < self.warmup_t:
            lrs = [self.warmup_lr_init + t * s for s in self.warmup_steps]
        else:
            lrs = [v * (self.gamma ** bisect_right(self.milestones, t)) for v in self.base_values]
        return lrs

    def get_epoch_values(self, epoch: int):
        if self.t_in_epochs:
            return self._get_lr(epoch)
        else:
            return None

    def get_update_values(self, num_updates: int):
        if not self.t_in_epochs:
            return self._get_lr(num_updates)
        else:
            return None
try:
    # noinspection PyUnresolvedReferences
    from apex import amp
except ImportError:
    amp = None


def parse_option():
    """Parse the command line and build the run config.

    The local rank is accepted as ``--local_rank`` or ``--local-rank`` and
    falls back to the ``LOCAL_RANK`` environment variable when that is set;
    it is only mandatory on the command line when the variable is absent.

    Returns:
        ``(args, config)``: the argparse namespace and ``get_config(args)``.
    """
    parser = argparse.ArgumentParser('Swin Transformer training and evaluation script', add_help=False)
    parser.add_argument('--cfg', type=str, required=True, metavar="FILE", help='path to config file', )
    parser.add_argument(
        "--opts",
        help="Modify config options by adding 'KEY VALUE' pairs. ",
        default=None,
        nargs='+',
    )

    # easy config modification
    parser.add_argument('--batch-size', type=int, help="batch size for single GPU")
    parser.add_argument('--data-path', type=str, help='path to dataset')
    parser.add_argument('--pretrained', type=str, help='path to pre-trained model')
    parser.add_argument('--resume', help='resume from checkpoint')
    parser.add_argument('--accumulation-steps', type=int, help="gradient accumulation steps")
    parser.add_argument('--use-checkpoint', action='store_true',
                        help="whether to use gradient checkpointing to save memory")
    parser.add_argument('--amp-opt-level', type=str, default='O1', choices=['O0', 'O1', 'O2'],
                        help='mixed precision opt level, if O0, no amp is used')
    parser.add_argument('--output', default='output', type=str, metavar='PATH',
                        help='root of output folder, the full path is // (default: output)')
    parser.add_argument('--tag', help='tag of experiment')
    parser.add_argument('--eval', action='store_true', help='Perform evaluation only')
    parser.add_argument('--throughput', action='store_true', help='Test throughput only')

    # distributed training: both spellings map to ``args.local_rank``.
    parser.add_argument("--local_rank", "--local-rank", type=int,
                        required="LOCAL_RANK" not in os.environ,
                        default=int(os.environ.get("LOCAL_RANK", 0)),
                        help='local rank for DistributedDataParallel')

    args = parser.parse_args()

    config = get_config(args)

    return args, config


def main(config):
    """Fine-tune and evaluate a model under DistributedDataParallel.

    Uses the module-level ``logger`` created by the script entry point.
    Depending on the config this resumes from a checkpoint, loads pre-trained
    weights, runs evaluation only, measures throughput only, or trains for
    ``config.TRAIN.EPOCHS`` epochs while validating after every epoch.
    """
    dataset_train, dataset_val, data_loader_train, data_loader_val, mixup_fn = build_loader(config, logger, is_pretrain=False)

    logger.info(f"Creating model:{config.MODEL.TYPE}/{config.MODEL.NAME}")
    model = build_model(config, is_pretrain=False)
    model.cuda()
    logger.info(str(model))

    optimizer = build_optimizer(config, model, logger, is_pretrain=False)
    if config.AMP_OPT_LEVEL != "O0":
        model, optimizer = amp.initialize(model, optimizer, opt_level=config.AMP_OPT_LEVEL)
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[config.LOCAL_RANK], broadcast_buffers=False)
    model_without_ddp = model.module

    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    logger.info(f"number of params: {n_parameters}")
    if hasattr(model_without_ddp, 'flops'):
        flops = model_without_ddp.flops()
        logger.info(f"number of GFLOPs: {flops / 1e9}")

    lr_scheduler = build_scheduler(config, optimizer, len(data_loader_train))

    # The loader returns a mixup_fn whenever mixup OR cutmix is enabled, and
    # that function turns the labels into soft targets, so the criterion must
    # follow mixup_fn rather than AUG.MIXUP alone.
    if mixup_fn is not None:
        # smoothing is handled with mixup label transform
        criterion = SoftTargetCrossEntropy()
    elif config.MODEL.LABEL_SMOOTHING > 0.:
        criterion = LabelSmoothingCrossEntropy(smoothing=config.MODEL.LABEL_SMOOTHING)
    else:
        criterion = torch.nn.CrossEntropyLoss()

    max_accuracy = 0.0

    if config.TRAIN.AUTO_RESUME:
        resume_file = auto_resume_helper(config.OUTPUT, logger)
        if resume_file:
            if config.MODEL.RESUME:
                logger.warning(f"auto-resume changing resume file from {config.MODEL.RESUME} to {resume_file}")
            config.defrost()
            config.MODEL.RESUME = resume_file
            config.freeze()
            logger.info(f'auto resuming from {resume_file}')
        else:
            logger.info(f'no checkpoint found in {config.OUTPUT}, ignoring auto resume')

    if config.MODEL.RESUME:
        max_accuracy = load_checkpoint(config, model_without_ddp, optimizer, lr_scheduler, logger)
        acc1, acc5, loss = validate(config, data_loader_val, model)
        logger.info(f"Accuracy of the network on the {len(dataset_val)} test images: {acc1:.1f}%")
        if config.EVAL_MODE:
            return
    elif config.PRETRAINED:
        load_pretrained(config, model_without_ddp, logger)

    if config.THROUGHPUT_MODE:
        throughput(data_loader_val, model, logger)
        return

    logger.info("Start training")
    start_time = time.time()
    for epoch in range(config.TRAIN.START_EPOCH, config.TRAIN.EPOCHS):
        # Reshuffle the distributed sampler for this epoch.
        data_loader_train.sampler.set_epoch(epoch)

        train_one_epoch(config, model, criterion, data_loader_train, optimizer, epoch, mixup_fn, lr_scheduler)
        if dist.get_rank() == 0 and (epoch % config.SAVE_FREQ == 0 or epoch == (config.TRAIN.EPOCHS - 1)):
            save_checkpoint(config, epoch, model_without_ddp, max_accuracy, optimizer, lr_scheduler, logger)

        acc1, acc5, loss = validate(config, data_loader_val, model)
        logger.info(f"Accuracy of the network on the {len(dataset_val)} test images: {acc1:.1f}%")
        max_accuracy = max(max_accuracy, acc1)
        logger.info(f'Max accuracy: {max_accuracy:.2f}%')

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    logger.info('Training time {}'.format(total_time_str))


def train_one_epoch(config, model, criterion, data_loader, optimizer, epoch, mixup_fn, lr_scheduler):
    """Run one training epoch, with optional gradient accumulation and apex amp.

    The scheduler is stepped per optimizer update with the global iteration
    index ``epoch * len(data_loader) + idx``. Progress is logged every
    ``config.PRINT_FREQ`` iterations through the module-level ``logger``.
    """
    model.train()
    optimizer.zero_grad()

    logger.info(f'Current learning rate for different parameter groups: {[it["lr"] for it in optimizer.param_groups]}')

    num_steps = len(data_loader)
    batch_time = AverageMeter()
    loss_meter = AverageMeter()
    norm_meter = AverageMeter()

    start = time.time()
    end = time.time()
    for idx, (samples, targets) in enumerate(data_loader):
        samples = samples.cuda(non_blocking=True)
        targets = targets.cuda(non_blocking=True)

        if mixup_fn is not None:
            samples, targets = mixup_fn(samples, targets)

        outputs = model(samples)

        if config.TRAIN.ACCUMULATION_STEPS > 1:
            loss = criterion(outputs, targets)
            # Scale so the accumulated gradient matches one large batch.
            loss = loss / config.TRAIN.ACCUMULATION_STEPS
            if config.AMP_OPT_LEVEL != "O0":
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                if config.TRAIN.CLIP_GRAD:
                    grad_norm = torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), config.TRAIN.CLIP_GRAD)
                else:
                    grad_norm = get_grad_norm(amp.master_params(optimizer))
            else:
                loss.backward()
                if config.TRAIN.CLIP_GRAD:
                    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.TRAIN.CLIP_GRAD)
                else:
                    grad_norm = get_grad_norm(model.parameters())
            # Step only once every ACCUMULATION_STEPS iterations.
            if (idx + 1) % config.TRAIN.ACCUMULATION_STEPS == 0:
                optimizer.step()
                optimizer.zero_grad()
                lr_scheduler.step_update(epoch * num_steps + idx)
        else:
            loss = criterion(outputs, targets)
            optimizer.zero_grad()
            if config.AMP_OPT_LEVEL != "O0":
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                if config.TRAIN.CLIP_GRAD:
                    grad_norm = torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), config.TRAIN.CLIP_GRAD)
                else:
                    grad_norm = get_grad_norm(amp.master_params(optimizer))
            else:
                loss.backward()
                if config.TRAIN.CLIP_GRAD:
                    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.TRAIN.CLIP_GRAD)
                else:
                    grad_norm = get_grad_norm(model.parameters())
            optimizer.step()
            lr_scheduler.step_update(epoch * num_steps + idx)

        # Wait for the GPU so the timing below covers the whole step.
        torch.cuda.synchronize()

        loss_meter.update(loss.item(), targets.size(0))
        norm_meter.update(grad_norm)
        batch_time.update(time.time() - end)
        end = time.time()

        if idx % config.PRINT_FREQ == 0:
            lr = optimizer.param_groups[-1]['lr']
            memory_used = torch.cuda.max_memory_allocated() / (1024.0 * 1024.0)
            etas = batch_time.avg * (num_steps - idx)
            logger.info(
                f'Train: [{epoch}/{config.TRAIN.EPOCHS}][{idx}/{num_steps}]\t'
                f'eta {datetime.timedelta(seconds=int(etas))} lr {lr:.6f}\t'
                f'time {batch_time.val:.4f} ({batch_time.avg:.4f})\t'
                f'loss {loss_meter.val:.4f} ({loss_meter.avg:.4f})\t'
                f'grad_norm {norm_meter.val:.4f} ({norm_meter.avg:.4f})\t'
                f'mem {memory_used:.0f}MB')
    epoch_time = time.time() - start
    logger.info(f"EPOCH {epoch} training takes {datetime.timedelta(seconds=int(epoch_time))}")


@torch.no_grad()
def validate(config, data_loader, model):
    """Evaluate ``model`` on ``data_loader``.

    Per-batch loss and top-1/top-5 accuracy are passed through
    ``reduce_tensor`` before being averaged.

    Returns:
        ``(acc1_avg, acc5_avg, loss_avg)``.
    """
    criterion = torch.nn.CrossEntropyLoss()
    model.eval()

    batch_time = AverageMeter()
    loss_meter = AverageMeter()
    acc1_meter = AverageMeter()
    acc5_meter = AverageMeter()

    end = time.time()
    for idx, (images, target) in enumerate(data_loader):
        images = images.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        # compute output
        output = model(images)

        # measure accuracy and record loss
        loss = criterion(output, target)
        acc1, acc5 = accuracy(output, target, topk=(1, 5))

        acc1 = reduce_tensor(acc1)
        acc5 = reduce_tensor(acc5)
        loss = reduce_tensor(loss)

        loss_meter.update(loss.item(), target.size(0))
        acc1_meter.update(acc1.item(), target.size(0))
        acc5_meter.update(acc5.item(), target.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if idx % config.PRINT_FREQ == 0:
            memory_used = torch.cuda.max_memory_allocated() / (1024.0 * 1024.0)
            logger.info(
                f'Test: [{idx}/{len(data_loader)}]\t'
                f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                f'Loss {loss_meter.val:.4f} ({loss_meter.avg:.4f})\t'
                f'Acc@1 {acc1_meter.val:.3f} ({acc1_meter.avg:.3f})\t'
                f'Acc@5 {acc5_meter.val:.3f} ({acc5_meter.avg:.3f})\t'
                f'Mem {memory_used:.0f}MB')
    logger.info(f' * Acc@1 {acc1_meter.avg:.3f} Acc@5 {acc5_meter.avg:.3f}')
    return acc1_meter.avg, acc5_meter.avg, loss_meter.avg


@torch.no_grad()
def throughput(data_loader, model, logger):
    """Log the forward throughput (images per second) on the first batch.

    The batch is run 50 times as warm-up and then timed over 30 forward
    passes; the function returns after that first batch.
    """
    model.eval()

    for idx, (images, _) in enumerate(data_loader):
        images = images.cuda(non_blocking=True)
        batch_size = images.shape[0]
        for i in range(50):
            model(images)
        torch.cuda.synchronize()
        logger.info("throughput averaged with 30 times")
        tic1 = time.time()
        for i in range(30):
            model(images)
        torch.cuda.synchronize()
        tic2 = time.time()
        logger.info(f"batch_size {batch_size} throughput {30 * batch_size / (tic2 - tic1)}")
        return


if __name__ == '__main__':
    _, config = parse_option()

    if config.AMP_OPT_LEVEL != "O0":
        assert amp is not None, "amp not installed!"

    if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
        rank = int(os.environ["RANK"])
        world_size = int(os.environ['WORLD_SIZE'])
        print(f"RANK and WORLD_SIZE in environ: {rank}/{world_size}")
    else:
        rank = -1
        world_size = -1
    torch.cuda.set_device(config.LOCAL_RANK)
    torch.distributed.init_process_group(backend='nccl', init_method='env://', world_size=world_size, rank=rank)
    torch.distributed.barrier()

    # Offset the seed by rank so each process draws different random numbers.
    seed = config.SEED + dist.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    cudnn.benchmark = True

    # linear scale the learning rate according to total batch size, may not be optimal
    linear_scaled_lr = config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE * dist.get_world_size() / 512.0
    linear_scaled_warmup_lr = config.TRAIN.WARMUP_LR * config.DATA.BATCH_SIZE * dist.get_world_size() / 512.0
    linear_scaled_min_lr = config.TRAIN.MIN_LR * config.DATA.BATCH_SIZE * dist.get_world_size() / 512.0
    # gradient accumulation also need to scale the learning rate
    if config.TRAIN.ACCUMULATION_STEPS > 1:
        linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUMULATION_STEPS
        linear_scaled_warmup_lr = linear_scaled_warmup_lr * config.TRAIN.ACCUMULATION_STEPS
        linear_scaled_min_lr = linear_scaled_min_lr * config.TRAIN.ACCUMULATION_STEPS
    config.defrost()
    config.TRAIN.BASE_LR = linear_scaled_lr
    config.TRAIN.WARMUP_LR = linear_scaled_warmup_lr
    config.TRAIN.MIN_LR = linear_scaled_min_lr
    config.freeze()

    os.makedirs(config.OUTPUT, exist_ok=True)
    logger = create_logger(output_dir=config.OUTPUT, dist_rank=dist.get_rank(), name=f"{config.MODEL.NAME}")

    if dist.get_rank() == 0:
        path = os.path.join(config.OUTPUT, "config.json")
        with open(path, "w") as f:
            f.write(config.dump())
        logger.info(f"Full config saved to {path}")

    # print config
    logger.info(config.dump())

    main(config)
file mode 100644 index 0000000000000000000000000000000000000000..c87dfd993ec2726f05e7e86325cac158c3326db0 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/simmim/main_simmim.py @@ -0,0 +1,283 @@ +# -------------------------------------------------------- +# SimMIM +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ze Liu +# Modified by Zhenda Xie +# -------------------------------------------------------- + + +"""_summary_ + +Test code: + +simmim_pretrain__vit_base__img224__800ep.yaml + +python -m torch.distributed.launch \ + --nproc_per_node 3 \ + main_simmim.py \ + --cfg ./configs/vit_base__800ep/simmim_pretrain__vit_base__img224__800ep.yaml \ + --data-path /data/imagenet_ILSVRC/ILSVRC/Data/CLS-LOC/train \ + --batch-size 128 \ + --output ./output \ + --tag test_run \ + --amp-opt-level O0 +""" + +import os +import time +import argparse +import datetime +import numpy as np + +import torch +import torch.backends.cudnn as cudnn +import torch.distributed as dist +from timm.utils import AverageMeter, accuracy + +from config import get_config +from models import build_model +from data import build_loader +from lr_scheduler import build_scheduler +from optimizer import build_optimizer +from logger import create_logger +from utils import load_checkpoint, save_checkpoint, get_grad_norm, auto_resume_helper + + +from torch.utils.tensorboard import SummaryWriter + +try: + # noinspection PyUnresolvedReferences + # from apex import amp + import torch.cuda.amp as amp + from torch.cuda.amp import autocast as autocast +except ImportError: + amp = None + +# fixme: fix cpu number here! 
def parse_option():
    """Parse the SimMIM pre-training command line and build the run config.

    The local rank can be given as ``--local-rank`` or ``--local_rank``
    (different versions of ``torch.distributed.launch`` pass different
    spellings). When the ``LOCAL_RANK`` environment variable is set, as
    ``torchrun`` does, the flag becomes optional and defaults to that value;
    otherwise it stays mandatory, exactly as before.

    Returns:
        tuple: ``(args, config)`` where ``args`` is the parsed
        ``argparse.Namespace`` and ``config`` is whatever
        ``get_config(args)`` returns.
    """
    parser = argparse.ArgumentParser('SimMIM pre-training script', add_help=False)
    parser.add_argument('--cfg', type=str, required=True, metavar="FILE", help='path to config file', )
    parser.add_argument(
        "--opts",
        help="Modify config options by adding 'KEY VALUE' pairs. ",
        default=None,
        nargs='+',
    )

    # easy config modification
    parser.add_argument('--batch-size', type=int, help="batch size for single GPU")
    parser.add_argument('--data-path', type=str, help='path to dataset')
    parser.add_argument('--resume', help='resume from checkpoint')
    parser.add_argument('--accumulation-steps', type=int, help="gradient accumulation steps")
    parser.add_argument('--use-checkpoint', action='store_true',
                        help="whether to use gradient checkpointing to save memory")
    parser.add_argument('--amp-opt-level', type=str, default='O1', choices=['O0', 'O1', 'O2'],
                        help='mixed precision opt level, if O0, no amp is used')
    parser.add_argument('--output', default='output', type=str, metavar='PATH',
                        help='root of output folder, the full path is // (default: output)')
    parser.add_argument('--tag', help='tag of experiment')

    # distributed training
    # An explicit flag always wins; the environment only supplies the fallback,
    # so launchers that export LOCAL_RANK without passing a flag keep working.
    env_local_rank = os.environ.get('LOCAL_RANK')
    parser.add_argument("--local-rank", "--local_rank", dest="local_rank", type=int,
                        required=env_local_rank is None,
                        default=None if env_local_rank is None else int(env_local_rank),
                        help='local rank for DistributedDataParallel')

    # others
    parser.add_argument('--load-weight', type=str, help='Path to init model weight (only applicable for vit model)')

    parser.add_argument('--log_dir', default='./runs',
                        help='path where to tensorboard log')

    args = parser.parse_args()

    config = get_config(args)

    return args, config
is_pretrain=True) + + # Modified: use scalar for AMP + assert config.AMP_OPT_LEVEL == "O1", "Only support amp opt level: O1!" + scaler = torch.cuda.amp.GradScaler() + + # if config.AMP_OPT_LEVEL != "O0": + # model, optimizer = amp.initialize(model, optimizer, opt_level=config.AMP_OPT_LEVEL) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[config.LOCAL_RANK], broadcast_buffers=False) + model_without_ddp = model.module + + n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) + logger.info(f"number of params: {n_parameters}") + if hasattr(model_without_ddp, 'flops'): + flops = model_without_ddp.flops() + logger.info(f"number of GFLOPs: {flops / 1e9}") + + lr_scheduler = build_scheduler(config, optimizer, len(data_loader_train)) + + if config.TRAIN.AUTO_RESUME: + resume_file = auto_resume_helper(config.OUTPUT, logger) + if resume_file: + if config.MODEL.RESUME: + logger.warning(f"auto-resume changing resume file from {config.MODEL.RESUME} to {resume_file}") + config.defrost() + config.MODEL.RESUME = resume_file + config.freeze() + logger.info(f'auto resuming from {resume_file}') + else: + logger.info(f'no checkpoint found in {config.OUTPUT}, ignoring auto resume') + + if config.MODEL.RESUME: + load_checkpoint(config, model_without_ddp, optimizer, lr_scheduler, logger) + + + writer = SummaryWriter(log_dir=args.log_dir) + + logger.info("Start training") + start_time = time.time() + for epoch in range(config.TRAIN.START_EPOCH, config.TRAIN.EPOCHS): + data_loader_train.sampler.set_epoch(epoch) + + train_one_epoch(config, model, data_loader_train, optimizer, epoch, lr_scheduler, scaler, writer) + if dist.get_rank() == 0 and (epoch % config.SAVE_FREQ == 0 or epoch == (config.TRAIN.EPOCHS - 1)): + save_checkpoint(config, epoch, model_without_ddp, 0., optimizer, lr_scheduler, logger) + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + logger.info('Training time 
def train_one_epoch(config, model, data_loader, optimizer, epoch, lr_scheduler, scaler, writer):
    """Train the model for one epoch under autocast with a gradient scaler.

    Gradients are accumulated over ``config.TRAIN.ACCUMULATION_STEPS``
    micro-batches (values below 1 are treated as 1). The optimizer steps once
    per accumulation group, after the gradients have been unscaled and,
    when ``config.TRAIN.CLIP_GRAD`` is set, clipped.

    Args:
        config: run configuration; reads ``TRAIN.ACCUMULATION_STEPS``,
            ``TRAIN.CLIP_GRAD``, ``TRAIN.EPOCHS`` and ``PRINT_FREQ``.
        model: module called as ``model(img, mask)`` that returns the loss.
        data_loader: iterable yielding ``(img, mask, _)`` batches.
        optimizer: optimizer stepped through ``scaler``.
        epoch: index of the current epoch, used for logging and scheduling.
        lr_scheduler: scheduler exposing ``step_update(num_updates)``.
        scaler: gradient scaler used for loss scaling.
        writer: summary writer exposing ``add_scalar``.
    """
    model.train()
    # Also discards any partial accumulation group left over from the previous epoch.
    optimizer.zero_grad()

    num_steps = len(data_loader)
    accumulation_steps = max(int(config.TRAIN.ACCUMULATION_STEPS), 1)
    batch_time = AverageMeter()
    loss_meter = AverageMeter()
    norm_meter = AverageMeter()

    start = time.time()
    end = time.time()
    for idx, (img, mask, _) in enumerate(data_loader):
        img = img.cuda(non_blocking=True)
        mask = mask.cuda(non_blocking=True)

        with autocast():
            loss = model(img, mask)

        if accumulation_steps > 1:
            # Average over the group so the summed gradient matches one large batch.
            loss = loss / accumulation_steps
        scaler.scale(loss).backward()

        if (idx + 1) % accumulation_steps == 0:
            # Gradients still carry the loss-scale factor; unscale them first so
            # clipping and the reported norm refer to the true gradients.
            scaler.unscale_(optimizer)
            if config.TRAIN.CLIP_GRAD:
                grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.TRAIN.CLIP_GRAD)
            else:
                grad_norm = get_grad_norm(model.parameters())

            # Step the optimizer (skipped by the scaler on inf/nan gradients),
            # then refresh the scale factor.
            scaler.step(optimizer)
            scaler.update()
            # Clear gradients only after a step; clearing before every backward
            # would throw away the micro-batches accumulated so far.
            optimizer.zero_grad()
            lr_scheduler.step_update(epoch * num_steps + idx)
            norm_meter.update(float(grad_norm))

        torch.cuda.synchronize()

        loss_meter.update(loss.item(), img.size(0))
        batch_time.update(time.time() - end)
        end = time.time()

        if idx % config.PRINT_FREQ == 0:
            lr = optimizer.param_groups[0]['lr']
            memory_used = torch.cuda.max_memory_allocated() / (1024.0 * 1024.0)
            etas = batch_time.avg * (num_steps - idx)
            logger.info(
                f'Train: [{epoch}/{config.TRAIN.EPOCHS}][{idx}/{num_steps}]\t'
                f'eta {datetime.timedelta(seconds=int(etas))} lr {lr:.6f}\t'
                f'time {batch_time.val:.4f} ({batch_time.avg:.4f})\t'
                f'loss {loss_meter.val:.4f} ({loss_meter.avg:.4f})\t'
                f'grad_norm {norm_meter.val:.4f} ({norm_meter.avg:.4f})\t'
                f'mem {memory_used:.0f}MB')
            writer.add_scalar('loss', loss_meter.val, global_step=epoch * num_steps + idx)

    epoch_time = time.time() - start
    logger.info(f"EPOCH {epoch} training takes {datetime.timedelta(seconds=int(epoch_time))}")
def build_model(config, is_pretrain=True, load_weight=None):
    """Build either the SimMIM pre-training model or a fine-tuning backbone.

    Args:
        config: configuration object; ``config.MODEL.TYPE`` selects the
            backbone when ``is_pretrain`` is false.
        is_pretrain: when true, delegate to ``build_simmim``.
        load_weight: forwarded to ``build_simmim``; unused for fine-tuning.

    Returns:
        The constructed model.

    Raises:
        NotImplementedError: for an unknown fine-tuning model type.
    """
    if is_pretrain:
        return build_simmim(config, load_weight)

    kind = config.MODEL.TYPE
    if kind == 'swin':
        return build_swin(config)
    if kind == 'vit':
        return build_vit_mod(config)
    raise NotImplementedError(f"Unknown fine-tune model: {kind}")
class VisionTransformerForSimMIM(VisionTransformer):
    """ViT encoder for SimMIM: masked patches are swapped for a learnable token.

    The parent must be built with ``num_classes=0``. ``forward`` returns a
    feature map of shape (B, C, H, W).
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        assert self.num_classes == 0

        # One learnable embedding shared by every masked patch position.
        self.mask_token = nn.Parameter(torch.zeros(1, 1, self.embed_dim))
        self._trunc_normal_(self.mask_token, std=.02)

    def _trunc_normal_(self, tensor, mean=0., std=1.):
        """Fill ``tensor`` in place from a normal truncated to [-std, std]."""
        trunc_normal_(tensor, mean=mean, std=std, a=-std, b=std)

    def forward(self, x, mask):
        """Encode ``x`` with the patches flagged in ``mask`` replaced.

        Args:
            x: input images, passed to ``self.patch_embed``.
            mask: per-patch mask; flattened from dim 1 and used as a blend
                weight (1 selects the mask token, 0 keeps the patch).

        Returns:
            Tensor of shape (B, C, H, W) with ``H == W == int(sqrt(L))``.
        """
        tokens = self.patch_embed(x)

        assert mask is not None
        batch, num_tokens, _ = tokens.shape

        # Blend in the mask token wherever the mask is set.
        fill = self.mask_token.expand(batch, num_tokens, -1)
        weight = mask.flatten(1).unsqueeze(-1).type_as(fill)
        tokens = tokens * (1 - weight) + fill * weight

        if self.pos_embed is not None:
            tokens = self._pos_embed(tokens)

        tokens = self.norm(self.blocks(tokens))

        # Drop the first token before rebuilding the grid.
        # NOTE(review): presumably the class token added by `_pos_embed` — confirm in vit_simple.
        tokens = tokens[:, 1:]
        batch, num_tokens, channels = tokens.shape
        side = int(num_tokens ** 0.5)
        return tokens.permute(0, 2, 1).reshape(batch, channels, side, side)
def build_simmim(config, load_weight=None):
    """Build the SimMIM model (encoder plus reconstruction head).

    Args:
        config: configuration object; ``config.MODEL.TYPE`` selects the
            encoder (``'swin'`` or ``'vit'``). The Swin encoder is configured
            from ``config``; the ViT encoder uses fixed ViT-Base/16 settings.
        load_weight: optional path to a state dict loaded non-strictly into
            the ViT encoder. Ignored for Swin.

    Returns:
        SimMIM: the encoder wrapped with its decoder.

    Raises:
        NotImplementedError: for an unknown ``config.MODEL.TYPE``.
    """
    model_type = config.MODEL.TYPE
    if model_type == 'swin':
        encoder = SwinTransformerForSimMIM(
            img_size=config.DATA.IMG_SIZE,
            patch_size=config.MODEL.SWIN.PATCH_SIZE,
            in_chans=config.MODEL.SWIN.IN_CHANS,
            num_classes=0,
            embed_dim=config.MODEL.SWIN.EMBED_DIM,
            depths=config.MODEL.SWIN.DEPTHS,
            num_heads=config.MODEL.SWIN.NUM_HEADS,
            window_size=config.MODEL.SWIN.WINDOW_SIZE,
            mlp_ratio=config.MODEL.SWIN.MLP_RATIO,
            qkv_bias=config.MODEL.SWIN.QKV_BIAS,
            qk_scale=config.MODEL.SWIN.QK_SCALE,
            drop_rate=config.MODEL.DROP_RATE,
            drop_path_rate=config.MODEL.DROP_PATH_RATE,
            ape=config.MODEL.SWIN.APE,
            patch_norm=config.MODEL.SWIN.PATCH_NORM,
            use_checkpoint=config.TRAIN.USE_CHECKPOINT)
        encoder_stride = 32
    elif model_type == 'vit':
        # The ViT settings in `config` are deliberately not used here.
        print('Ignored all config about ViT!')
        encoder = VisionTransformerForSimMIM(
            img_size=224,
            patch_size=16,
            in_chans=3,
            num_classes=0,
            embed_dim=768,
            depth=12,
            num_heads=12,
            mlp_ratio=4.,
            qkv_bias=True,
            drop_rate=0.,
            drop_path_rate=0.,
            norm_layer=None)
        encoder_stride = 16

        # load pretrained weight
        if load_weight:
            # Load onto the CPU so ranks do not all restore tensors onto the
            # device the checkpoint was saved from; the model is moved later.
            model_weights = torch.load(load_weight, map_location='cpu')
            load_result = encoder.load_state_dict(model_weights, strict=False)
            print('loaded from pretrained weight')
            # strict=False hides mismatches, so report them explicitly.
            if load_result.missing_keys:
                print(f'keys missing from pretrained weight: {load_result.missing_keys}')
            if load_result.unexpected_keys:
                print(f'unexpected keys in pretrained weight: {load_result.unexpected_keys}')

    else:
        raise NotImplementedError(f"Unknown pre-train model: {model_type}")

    model = SimMIM(encoder=encoder, encoder_stride=encoder_stride)

    return model
def window_reverse(windows, window_size, H, W):
    """Stitch non-overlapping windows back into full feature maps.

    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image

    Returns:
        x: (B, H, W, C)
    """
    windows_per_image = H * W / window_size / window_size
    batch = int(windows.shape[0] / windows_per_image)
    rows, cols = H // window_size, W // window_size

    # (B, rows, cols, ws, ws, C) -> (B, rows, ws, cols, ws, C) -> (B, H, W, C)
    grid = windows.view(batch, rows, cols, window_size, window_size, -1)
    merged = grid.permute(0, 1, 3, 2, 4, 5).contiguous()
    return merged.view(batch, H, W, -1)
Default: 0.0 + """ + + def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.): + + super().__init__() + self.dim = dim + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim ** -0.5 + + # define a parameter table of relative position bias + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(self.window_size[0]) + coords_w = torch.arange(self.window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer("relative_position_index", relative_position_index) + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table, std=.02) + self.softmax = nn.Softmax(dim=-1) + + def forward(self, x, mask=None): + """ + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + 
q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B_, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + def extra_repr(self) -> str: + return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}' + + def flops(self, N): + # calculate flops for 1 window with token length of N + flops = 0 + # qkv = self.qkv(x) + flops += N * self.dim * 3 * self.dim + # attn = (q @ k.transpose(-2, -1)) + flops += self.num_heads * N * (self.dim // self.num_heads) * N + # x = (attn @ v) + flops += self.num_heads * N * N * (self.dim // self.num_heads) + # x = self.proj(x) + flops += N * self.dim * self.dim + return flops + + +class SwinTransformerBlock(nn.Module): + r""" Swin Transformer Block. + + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resulotion. + num_heads (int): Number of attention heads. + window_size (int): Window size. + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. 
Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Module, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0, + mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0., + act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + if min(self.input_resolution) <= self.window_size: + # if window size is larger than input resolution, we don't partition windows + self.shift_size = 0 + self.window_size = min(self.input_resolution) + assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention( + dim, window_size=to_2tuple(self.window_size), num_heads=num_heads, + qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + if self.shift_size > 0: + # calculate attention mask for SW-MSA + H, W = self.input_resolution + img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1 + h_slices = (slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)) + w_slices = (slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1 + mask_windows = mask_windows.view(-1, self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) + else: + attn_mask = None + + self.register_buffer("attn_mask", attn_mask) + + def forward(self, x): + H, W = self.input_resolution + B, L, C = x.shape + assert L == H * W, "input feature has wrong size" + + shortcut = x + x = self.norm1(x) + x = x.view(B, H, W, C) + + # cyclic shift + if self.shift_size > 0: + shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) + else: + shifted_x = x + + # partition windows + x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C + x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C + + # W-MSA/SW-MSA + attn_windows = self.attn(x_windows, mask=self.attn_mask) # nW*B, window_size*window_size, C + + # merge windows + attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) + shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H' W' C + + # reverse cyclic shift + if 
class PatchMerging(nn.Module):
    r""" Patch Merging Layer.

    Every 2x2 neighbourhood of tokens is concatenated along the channel axis
    (C -> 4*C), normalised, and linearly projected to 2*C channels.

    Args:
        input_resolution (tuple[int]): Resolution of input feature.
        dim (int): Number of input channels.
        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
    """

    def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
        super().__init__()
        self.input_resolution = input_resolution
        self.dim = dim
        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
        self.norm = norm_layer(4 * dim)

    def forward(self, x):
        """
        x: B, H*W, C
        returns: B, H/2*W/2, 2*C
        """
        H, W = self.input_resolution
        B, L, C = x.shape
        assert L == H * W, "input feature has wrong size"
        assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) are not even."

        # Split rows and columns into (index, parity) pairs: B H/2 2 W/2 2 C
        cells = x.view(B, H // 2, 2, W // 2, 2, C)
        # Move the parities next to the channels, column parity outermost, so the
        # channel blocks are ordered (row0,col0), (row1,col0), (row0,col1), (row1,col1).
        cells = cells.permute(0, 1, 3, 4, 2, 5)
        merged = cells.reshape(B, -1, 4 * C)  # B H/2*W/2 4*C

        return self.reduction(self.norm(merged))

    def extra_repr(self) -> str:
        return f"input_resolution={self.input_resolution}, dim={self.dim}"

    def flops(self):
        """Estimated FLOPs: the norm over every input token plus the projection."""
        H, W = self.input_resolution
        norm_cost = H * W * self.dim
        reduction_cost = (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim
        return norm_cost + reduction_cost
+ """ + + def __init__(self, dim, input_resolution, depth, num_heads, window_size, + mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False): + + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.ModuleList([ + SwinTransformerBlock(dim=dim, input_resolution=input_resolution, + num_heads=num_heads, window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop, attn_drop=attn_drop, + drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer) + for i in range(depth)]) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer) + else: + self.downsample = None + + def forward(self, x): + for blk in self.blocks: + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x) + else: + x = blk(x) + if self.downsample is not None: + x = self.downsample(x) + return x + + def extra_repr(self) -> str: + return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}" + + def flops(self): + flops = 0 + for blk in self.blocks: + flops += blk.flops() + if self.downsample is not None: + flops += self.downsample.flops() + return flops + + +class PatchEmbed(nn.Module): + r""" Image to Patch Embedding + + Args: + img_size (int): Image size. Default: 224. + patch_size (int): Patch token size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Module, optional): Normalization layer. 
Default: None + """ + + def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]] + self.img_size = img_size + self.patch_size = patch_size + self.patches_resolution = patches_resolution + self.num_patches = patches_resolution[0] * patches_resolution[1] + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + B, C, H, W = x.shape + # FIXME look at relaxing size constraints + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + x = self.proj(x).flatten(2).transpose(1, 2) # B Ph*Pw C + if self.norm is not None: + x = self.norm(x) + return x + + def flops(self): + Ho, Wo = self.patches_resolution + flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1]) + if self.norm is not None: + flops += Ho * Wo * self.embed_dim + return flops + + +class SwinTransformer(nn.Module): + r""" Swin Transformer + A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - + https://arxiv.org/pdf/2103.14030 + + Args: + img_size (int | tuple(int)): Input image size. Default 224 + patch_size (int | tuple(int)): Patch size. Default: 4 + in_chans (int): Number of input image channels. Default: 3 + num_classes (int): Number of classes for classification head. Default: 1000 + embed_dim (int): Patch embedding dimension. Default: 96 + depths (tuple(int)): Depth of each Swin Transformer layer. + num_heads (tuple(int)): Number of attention heads in different layers. + window_size (int): Window size. 
Default: 7 + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4 + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None + drop_rate (float): Dropout rate. Default: 0 + attn_drop_rate (float): Attention dropout rate. Default: 0 + drop_path_rate (float): Stochastic depth rate. Default: 0.1 + norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. + ape (bool): If True, add absolute position embedding to the patch embedding. Default: False + patch_norm (bool): If True, add normalization after patch embedding. Default: True + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False + """ + + def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes=1000, + embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], + window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None, + drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1, + norm_layer=nn.LayerNorm, ape=False, patch_norm=True, + use_checkpoint=False, **kwargs): + super().__init__() + + self.img_size = img_size + self.patch_size = patch_size + self.in_chans = in_chans + + self.num_classes = num_classes + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.ape = ape + self.patch_norm = patch_norm + self.num_features = int(embed_dim * 2 ** (self.num_layers - 1)) + self.mlp_ratio = mlp_ratio + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None) + num_patches = self.patch_embed.num_patches + patches_resolution = self.patch_embed.patches_resolution + self.patches_resolution = patches_resolution + + # absolute position embedding + if self.ape: + self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim)) + 
trunc_normal_(self.absolute_pos_embed, std=.02) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule + + # build layers + self.layers = nn.ModuleList() + for i_layer in range(self.num_layers): + layer = BasicLayer(dim=int(embed_dim * 2 ** i_layer), + input_resolution=(patches_resolution[0] // (2 ** i_layer), + patches_resolution[1] // (2 ** i_layer)), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=self.mlp_ratio, + qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, + use_checkpoint=use_checkpoint) + self.layers.append(layer) + + self.norm = norm_layer(self.num_features) + self.avgpool = nn.AdaptiveAvgPool1d(1) + self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + return {'absolute_pos_embed'} + + @torch.jit.ignore + def no_weight_decay_keywords(self): + return {'relative_position_bias_table'} + + def forward_features(self, x): + x = self.patch_embed(x) + if self.ape: + x = x + self.absolute_pos_embed + x = self.pos_drop(x) + + for layer in self.layers: + x = layer(x) + + x = self.norm(x) # B L C + x = self.avgpool(x.transpose(1, 2)) # B C 1 + x = torch.flatten(x, 1) + return x + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + + def flops(self): + flops = 0 + flops += 
self.patch_embed.flops() + for i, layer in enumerate(self.layers): + flops += layer.flops() + flops += self.num_features * self.patches_resolution[0] * self.patches_resolution[1] // (2 ** self.num_layers) + flops += self.num_features * self.num_classes + return flops + + +def build_swin(config): + model = SwinTransformer( + img_size=config.DATA.IMG_SIZE, + patch_size=config.MODEL.SWIN.PATCH_SIZE, + in_chans=config.MODEL.SWIN.IN_CHANS, + num_classes=config.MODEL.NUM_CLASSES, + embed_dim=config.MODEL.SWIN.EMBED_DIM, + depths=config.MODEL.SWIN.DEPTHS, + num_heads=config.MODEL.SWIN.NUM_HEADS, + window_size=config.MODEL.SWIN.WINDOW_SIZE, + mlp_ratio=config.MODEL.SWIN.MLP_RATIO, + qkv_bias=config.MODEL.SWIN.QKV_BIAS, + qk_scale=config.MODEL.SWIN.QK_SCALE, + drop_rate=config.MODEL.DROP_RATE, + drop_path_rate=config.MODEL.DROP_PATH_RATE, + ape=config.MODEL.SWIN.APE, + patch_norm=config.MODEL.SWIN.PATCH_NORM, + use_checkpoint=config.TRAIN.USE_CHECKPOINT) + + return model \ No newline at end of file diff --git a/PuzzleTuning/Counterpart PreTrain Methods/simmim/models/vision_transformer.py b/PuzzleTuning/Counterpart PreTrain Methods/simmim/models/vision_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..c161245c2485eec92e1c1ed3af2ac6c5b5c6c166 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/simmim/models/vision_transformer.py @@ -0,0 +1,377 @@ +# -------------------------------------------------------- +# SimMIM +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Based on BEIT code bases (https://github.com/microsoft/unilm/tree/master/beit) +# Written by Yutong Lin, Zhenda Xie +# -------------------------------------------------------- + +import math +from functools import partial + +import torch +import torch.nn as nn +import torch.nn.functional as F +from timm.models.layers import DropPath, to_2tuple, trunc_normal_ + + +class Mlp(nn.Module): + def __init__(self, in_features, 
hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + # x = self.drop(x) + # comment out this for the orignal BERT implement + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Module): + def __init__( + self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., + proj_drop=0., window_size=None, attn_head_dim=None): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + if attn_head_dim is not None: + head_dim = attn_head_dim + all_head_dim = head_dim * self.num_heads + self.scale = qk_scale or head_dim ** -0.5 + + self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False) + if qkv_bias: + self.q_bias = nn.Parameter(torch.zeros(all_head_dim)) + self.v_bias = nn.Parameter(torch.zeros(all_head_dim)) + else: + self.q_bias = None + self.v_bias = None + + if window_size: + self.window_size = window_size + # cls to token & token to cls & cls to cls + self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 + self.relative_position_bias_table = nn.Parameter( + torch.zeros(self.num_relative_distance, num_heads)) # 2*Wh-1 * 2*Ww-1, nH + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(window_size[0]) + coords_w = torch.arange(window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size[0] - 1 
# shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = \ + torch.zeros(size=(window_size[0] * window_size[1] + 1, ) * 2, dtype=relative_coords.dtype) + relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer("relative_position_index", relative_position_index) + else: + self.window_size = None + self.relative_position_bias_table = None + self.relative_position_index = None + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(all_head_dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x, rel_pos_bias=None): + B, N, C = x.shape + qkv_bias = None + if self.q_bias is not None: + qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias)) + qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias) + qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + if self.relative_position_bias_table is not None: + relative_position_bias = \ + self.relative_position_bias_table[self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if rel_pos_bias is not None: + attn = attn + rel_pos_bias + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, -1) + x = self.proj(x) + x = self.proj_drop(x) + 
return x + + +class Block(nn.Module): + + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., init_values=None, act_layer=nn.GELU, norm_layer=nn.LayerNorm, + window_size=None, attn_head_dim=None): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, + attn_drop=attn_drop, proj_drop=drop, window_size=window_size, attn_head_dim=attn_head_dim) + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + if init_values is not None: + self.gamma_1 = nn.Parameter(init_values * torch.ones((dim)),requires_grad=True) + self.gamma_2 = nn.Parameter(init_values * torch.ones((dim)),requires_grad=True) + else: + self.gamma_1, self.gamma_2 = None, None + + def forward(self, x, rel_pos_bias=None): + if self.gamma_1 is None: + x = x + self.drop_path(self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + else: + x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)) + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Module): + """ Image to Patch Embedding + """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) + self.patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, x, **kwargs): + B, C, H, W 
= x.shape + # FIXME look at relaxing size constraints + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + x = self.proj(x).flatten(2).transpose(1, 2) + return x + + +class RelativePositionBias(nn.Module): + + def __init__(self, window_size, num_heads): + super().__init__() + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 + self.relative_position_bias_table = nn.Parameter( + torch.zeros(self.num_relative_distance, num_heads)) # 2*Wh-1 * 2*Ww-1, nH + # cls to token & token 2 cls & cls to cls + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(window_size[0]) + coords_w = torch.arange(window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = \ + torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype) + relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer("relative_position_index", relative_position_index) + + def forward(self): + relative_position_bias = \ + self.relative_position_bias_table[self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * 
self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH + return relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + + +class VisionTransformer(nn.Module): + """ Vision Transformer with support for patch or hybrid CNN input stage + """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12, + num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., + drop_path_rate=0., norm_layer=nn.LayerNorm, init_values=None, + use_abs_pos_emb=True, use_rel_pos_bias=False, use_shared_rel_pos_bias=False, + use_mean_pooling=True, init_scale=0.001): + super().__init__() + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim + self.patch_size = patch_size + self.in_chans = in_chans + + self.patch_embed = PatchEmbed( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + if use_abs_pos_emb: + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim)) + else: + self.pos_embed = None + self.pos_drop = nn.Dropout(p=drop_rate) + + if use_shared_rel_pos_bias: + self.rel_pos_bias = RelativePositionBias(window_size=self.patch_embed.patch_shape, num_heads=num_heads) + else: + self.rel_pos_bias = None + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + self.use_rel_pos_bias = use_rel_pos_bias + self.blocks = nn.ModuleList([ + Block( + dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, + init_values=init_values, window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None) + for i in range(depth)]) + self.norm = nn.Identity() if use_mean_pooling else norm_layer(embed_dim) + self.fc_norm = norm_layer(embed_dim) if use_mean_pooling else 
None + self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + if self.pos_embed is not None: + self._trunc_normal_(self.pos_embed, std=.02) + self._trunc_normal_(self.cls_token, std=.02) + if num_classes > 0: + self._trunc_normal_(self.head.weight, std=.02) + self.apply(self._init_weights) + self.fix_init_weight() + + if num_classes > 0: + self.head.weight.data.mul_(init_scale) + self.head.bias.data.mul_(init_scale) + + def _trunc_normal_(self, tensor, mean=0., std=1.): + trunc_normal_(tensor, mean=mean, std=std) + + def fix_init_weight(self): + def rescale(param, layer_id): + param.div_(math.sqrt(2.0 * layer_id)) + + for layer_id, layer in enumerate(self.blocks): + rescale(layer.attn.proj.weight.data, layer_id + 1) + rescale(layer.mlp.fc2.weight.data, layer_id + 1) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + self._trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + self._trunc_normal_(m.weight, std=.02) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def get_num_layers(self): + return len(self.blocks) + + @torch.jit.ignore + def no_weight_decay(self): + return {'pos_embed', 'cls_token'} + + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=''): + self.num_classes = num_classes + self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + x = self.patch_embed(x) + batch_size, seq_len, _ = x.size() + + cls_tokens = self.cls_token.expand(batch_size, -1, -1) # stole cls_tokens impl from Phil Wang, thanks + x = torch.cat((cls_tokens, x), dim=1) + if self.pos_embed is not None: + x = x + self.pos_embed + x = self.pos_drop(x) + + rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not 
None else None + for blk in self.blocks: + x = blk(x, rel_pos_bias=rel_pos_bias) + + x = self.norm(x) + if self.fc_norm is not None: + t = x[:, 1:, :] + return self.fc_norm(t.mean(1)) + else: + return x[:, 0] + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + + +def build_vit(config): + model = VisionTransformer( + img_size=config.DATA.IMG_SIZE, + patch_size=config.MODEL.VIT.PATCH_SIZE, + in_chans=config.MODEL.VIT.IN_CHANS, + num_classes=config.MODEL.NUM_CLASSES, + embed_dim=config.MODEL.VIT.EMBED_DIM, + depth=config.MODEL.VIT.DEPTH, + num_heads=config.MODEL.VIT.NUM_HEADS, + mlp_ratio=config.MODEL.VIT.MLP_RATIO, + qkv_bias=config.MODEL.VIT.QKV_BIAS, + drop_rate=config.MODEL.DROP_RATE, + drop_path_rate=config.MODEL.DROP_PATH_RATE, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + init_values=config.MODEL.VIT.INIT_VALUES, + use_abs_pos_emb=config.MODEL.VIT.USE_APE, + use_rel_pos_bias=config.MODEL.VIT.USE_RPB, + use_shared_rel_pos_bias=config.MODEL.VIT.USE_SHARED_RPB, + use_mean_pooling=config.MODEL.VIT.USE_MEAN_POOLING) + + return model + +def build_vit_mod(config): + model = VisionTransformer( + img_size=224, + patch_size=16, + in_chans=3, + num_classes=1000, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4., + qkv_bias=True, + drop_rate=0., + drop_path_rate=0., + norm_layer=partial(nn.LayerNorm, eps=1e-6), + init_values=None, + use_abs_pos_emb=True, + use_rel_pos_bias=False, + use_shared_rel_pos_bias=False, + use_mean_pooling=False) + + return model \ No newline at end of file diff --git a/PuzzleTuning/Counterpart PreTrain Methods/simmim/models/vit_simple.py b/PuzzleTuning/Counterpart PreTrain Methods/simmim/models/vit_simple.py new file mode 100644 index 0000000000000000000000000000000000000000..61b11eb7f5299e4431a41daf43d29049133c6af9 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/simmim/models/vit_simple.py @@ -0,0 +1,360 @@ +""" +Here defines the model. 
+""" + +import torch +import torch.nn as nn +from functools import partial +from timm.models.layers import DropPath, to_2tuple + + +class PatchEmbed(nn.Module): + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + """Image to patch embedding + Input dimension: [B, c, h, w] + + Notes: + B : Batches + c : Input channels + h : Input heights + w : Input widths + D : Patch dimension + + Args: + img_size (int, optional): Width and height of the input image. Defaults to 224. + patch_size (int, optional): Width and height of the patch. Defaults to 16. + in_chans (int, optional): Input image channel. Defaults to 3. + embed_dim (int, optional): Patch embedding dimension. Defaults to 768. + """ + super().__init__() + + img_size = to_2tuple(img_size) # (img_size, img_size) + patch_size = to_2tuple(patch_size) # (patch_size, patch_size) + + self.num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + + # Use CNN to split patches + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, x): + """Divide image into batches + + Args: + x : [B, c, h, w] + + Returns: + _type_: _description_ + """ + # Divide of the input image into patches + # [B, c, h, w] -> [B, D, h//patch_size, w//patch_size] + # eg: [10, 3, 224, 224] -> [10, 768, 14, 14] + x = self.proj(x) + + # Flatten + # [B, D, h//patch_size, w//patch_size] -> [B, D, h//patch_size*w//patch_size] + # eg: [10, 768, 14, 14] -> [10, 768, 196] + x = x.flatten(2) + + # Transpose + # [B, D, h//patch_size*w//patch_size] -> [B, h//patch_size*w//patch_size, D] + # eg: [10, 768, 196] -> [10, 196, 768] + x = x.transpose(1, 2) + return x + + +class FFN(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + """Feed forward network (with one hidden layer) + + Args: + in_features: Number of input features + hidden_features 
(optional): Number of features in the hidden layer. Defaults to None. + out_features (optional): Number of output features. Defaults to None. + act_layer (optional): The activation function. Defaults to nn.GELU. + drop (optional): Dropout percentage. Defaults to 0.. + """ + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Module): + def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.): + """Multi-head self attention + + Args: + dim : Patch dimension + num_heads (optional): Number of heads. Defaults to 8. + qkv_bias (bool, optional): Whether we add bias for each output. Defaults to False. + attn_drop (optional): Drop out for the output of softmax(q*k^T). Defaults to 0.. + proj_drop (optional): Drop out for the final MLP. Defaults to 0.. 
+ """ + super().__init__() + assert dim % num_heads == 0, 'dim should be divisible by num_heads' + + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim ** -0.5 + + # The linear layer used to divide qkv from the input for self attention + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + """Forward function + + Notes: + B: Batch + N: Patch number + D: Patch dimension (embedding dimension) (should be divisible by H) + H: Number of heads + + Args: + x : [B, N, D] + + Returns: + x : [B, N, D] + """ + + B, N, D = x.shape + + # Generate qkv based using a same FFN + # [B, N, D] -> [B, N, 3D] -> [B, N, 3, H, D/H] -> [3, B, H, N, D/H] + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, D // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # [B, H, N, D/H] + + # Here we will generate a correlation matrix by applying q * k^T + # [B, H, N, D/H] @ [B, H, D/H, N] -> [B, H, N, N] -> normalize + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) # Default not been used + + # Weighted sum + # [B, H, N, N] @ [B, H, N, D/H] -> [B, H, N, D/H] -> aggregate to [B, N, D] + x = (attn @ v).transpose(1, 2).reshape(B, N, D) + + # Use MLP to stable the training process + x = self.proj(x) + x = self.proj_drop(x) # Default not been used + + # x: [B, N, D] + return x + + +class Block(nn.Module): + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): + """Transformer encoder block + + Args: + dim : Patch dimension. + num_heads : Number of heads in the attention model. + mlp_ratio (optional): Hidden layer dimension (times of the patch dimenstion). Defaults to 4.. + qkv_bias (bool, optional): Whether we add bias for each output. Defaults to False. 
class VisionTransformer(nn.Module):
    def __init__(
            self,
            img_size=224,
            patch_size=16,
            in_chans=3,
            num_classes=1000,
            embed_dim=768,
            depth=12,
            num_heads=12,
            mlp_ratio=4.,
            qkv_bias=True,
            pre_norm=False,
            drop_rate=0.,
            attn_drop_rate=0.,
            drop_path_rate=0.,
            embed_layer=PatchEmbed,
            norm_layer=None,
            act_layer=None,
            block_fn=Block,
    ):
        """Vision Transformer (SimMIM compat version)

        A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` -
        https://arxiv.org/abs/2010.11929

        Ref: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/vision_transformer.py

        The classification head is intentionally NOT registered as a sub-module
        (it is not used by SimMIM), so the module holds no head parameters.

        Args:
            img_size (int, tuple, optional): Input image size, forwarded to ``embed_layer``. Defaults to 224.
            patch_size (int, tuple, optional): Patch size, forwarded to ``embed_layer``. Defaults to 16.
            in_chans (int, optional): Number of input channels. Defaults to 3.
            num_classes (int, optional): Number of classes; only stored on the instance
                (``self.num_classes``) because no head is built here. Defaults to 1000.
            embed_dim (int, optional): Embedding dimension (patch dimension). Defaults to 768.
            depth (int, optional): Number of transformer encoder blocks. Defaults to 12.
            num_heads (int, optional): Number of attention heads. Defaults to 12.
            mlp_ratio (float, optional): Ratio of mlp hidden dim to embedding dim. Defaults to 4..
            qkv_bias (bool, optional): Enable bias for qkv if True. Defaults to True.
            pre_norm (bool, optional): Whether to normalize before the encoder blocks. Defaults to False.
            drop_rate (float, optional): Dropout rate, used for the positional dropout and
                passed to every block as ``drop``. Defaults to 0..
            attn_drop_rate (float, optional): Attention dropout rate passed to every block. Defaults to 0..
            drop_path_rate (float, optional): Maximum stochastic depth rate. Defaults to 0..
            embed_layer (nn.Module, optional): Patch embedding layer class. Defaults to PatchEmbed.
            norm_layer (nn.Module, optional): Customized normalization layer.
                Defaults to None, meaning ``partial(nn.LayerNorm, eps=1e-6)``.
            act_layer (nn.Module, optional): Customized MLP activation layer.
                Defaults to None, meaning ``nn.GELU``.
            block_fn (nn.Module, optional): Encoder block class. Defaults to Block.
        """
        super().__init__()

        # Setup normalization and activation function
        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)  # Layer normalization with eps=1e-6
        act_layer = act_layer or nn.GELU

        self.num_classes = num_classes
        self.embed_dim = embed_dim

        # Added for SimMIM compatibility
        self.num_features = embed_dim
        self.in_chans = in_chans
        self.patch_size = patch_size

        # Define the patch embedding
        self.patch_embed = embed_layer(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim
        )

        # Define the class token
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))

        # Define the positional embedding (one slot per patch plus one for the class token)
        embed_len = self.patch_embed.num_patches + 1
        self.pos_embed = nn.Parameter(torch.randn(1, embed_len, embed_dim) * .02)
        self.pos_drop = nn.Dropout(p=drop_rate)
        self.norm_pre = norm_layer(embed_dim) if pre_norm else nn.Identity()

        # Define the stochastic depth decay rule based on drop path:
        # the per-block rate grows linearly from 0 (first block) to drop_path_rate (last block)
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]

        # Define the encoder blocks
        self.blocks = nn.Sequential(*[
            block_fn(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[i],
                norm_layer=norm_layer,
                act_layer=act_layer
            )
            for i in range(depth)])
        self.norm = norm_layer(embed_dim)

        # NOTE: no classifier head is registered here on purpose (not used in SimMIM).
        # A fine-tuning user may attach one afterwards as `self.head`;
        # `forward_head` applies it when it exists.

    def _pos_embed(self, x):
        """Prepend the class token, add the positional embedding and apply dropout.

        Args:
            x : patch tokens, [B, N-1, D] where N-1 == self.patch_embed.num_patches

        Returns:
            x : [B, N, D]
        """
        # Concat the class token (broadcast to the batch size) in front of the patch tokens
        x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)

        # Add the positional embedding (learnable parameter)
        x = x + self.pos_embed

        # Drop out
        x = self.pos_drop(x)
        return x

    def forward_features(self, x):
        """Run the encoder and return the normalized token sequence.

        Args:
            x : input batch, expected [B, c, h, w] for the default patch embedding

        Returns:
            x : [B, N, D], token 0 being the class token
        """
        # Patch embedding [B, c, h, w] -> [B, N-1, D]
        x = self.patch_embed(x)

        # Positional embedding [B, N-1, D] -> [B, N, D]
        x = self._pos_embed(x)

        # Pre-normalize (Identity unless pre_norm=True) [B, N, D]
        x = self.norm_pre(x)

        # Transformer encoder blocks [B, N, D]
        x = self.blocks(x)

        # Final normalization [B, N, D]
        x = self.norm(x)
        return x

    def forward_head(self, x):
        """Reduce the token sequence to the class-token representation.

        The previous implementation returned a freshly constructed (untrained)
        ``nn.Linear`` module instead of a tensor, discarding the features.
        Because this model registers no head, the class-token embedding itself
        is returned; if a ``head`` module has been attached to the instance
        (e.g. for fine-tuning) it is applied on top.

        Args:
            x : [B, N, D] output of `forward_features`

        Returns:
            [B, D] class-token features, or the output of ``self.head`` on them
            when a head is attached.
        """
        # Fetch the class token [B, N, D] -> [B, D]
        x = x[:, 0]

        # nn.Module raises AttributeError for unknown attributes, so getattr's
        # default cleanly detects the "no head attached" case.
        head = getattr(self, "head", None)
        if head is not None:
            x = head(x)
        return x

    def forward(self, x):
        """Full forward pass: encoder features followed by `forward_head`."""
        # The main part of the ViT
        x = self.forward_features(x)

        # Fetch result based on the class token
        x = self.forward_head(x)
        return x
See +https://pytorch.org/docs/stable/distributed.html#launch-utility for +further instructions + + warnings.warn( +=> merge config from ./configs/vit_base__test/simmim_pretrain__vit_base__img224__100ep.yaml +RANK and WORLD_SIZE in environ: 2/4 +=> merge config from ./configs/vit_base__test/simmim_pretrain__vit_base__img224__100ep.yaml +RANK and WORLD_SIZE in environ: 3/4 +=> merge config from ./configs/vit_base__test/simmim_pretrain__vit_base__img224__100ep.yaml +RANK and WORLD_SIZE in environ: 0/4 +=> merge config from ./configs/vit_base__test/simmim_pretrain__vit_base__img224__100ep.yaml +RANK and WORLD_SIZE in environ: 1/4 +[2023-10-10 03:03:53 simmim_pretrain](main_simmim.py 278): INFO Full config saved to /root/autodl-tmp/LSQ-simmim/checkpoint/simmim_pretrain/vit_simmim/config.json +[2023-10-10 03:03:53 simmim_pretrain](main_simmim.py 281): INFO AMP_OPT_LEVEL: O1 +AUG: + AUTO_AUGMENT: rand-m9-mstd0.5-inc1 + COLOR_JITTER: 0.4 + CUTMIX: 1.0 + CUTMIX_MINMAX: null + MIXUP: 0.8 + MIXUP_MODE: batch + MIXUP_PROB: 1.0 + MIXUP_SWITCH_PROB: 0.5 + RECOUNT: 1 + REMODE: pixel + REPROB: 0.25 +BASE: +- '' +DATA: + BATCH_SIZE: 128 + DATASET: imagenet + DATA_PATH: /root/autodl-tmp/datasets/All + IMG_SIZE: 224 + INTERPOLATION: bicubic + MASK_PATCH_SIZE: 32 + MASK_RATIO: 0.6 + NUM_WORKERS: 8 + PIN_MEMORY: true +EVAL_MODE: false +LOCAL_RANK: 0 +MODEL: + DROP_PATH_RATE: 0.0 + DROP_RATE: 0.0 + LABEL_SMOOTHING: 0.1 + NAME: simmim_pretrain + NUM_CLASSES: 1000 + RESUME: '' + SWIN: + APE: false + DEPTHS: + - 2 + - 2 + - 6 + - 2 + EMBED_DIM: 96 + IN_CHANS: 3 + MLP_RATIO: 4.0 + NUM_HEADS: + - 3 + - 6 + - 12 + - 24 + PATCH_NORM: true + PATCH_SIZE: 4 + QKV_BIAS: true + QK_SCALE: null + WINDOW_SIZE: 7 + TYPE: vit + VIT: + DEPTH: 12 + EMBED_DIM: 768 + INIT_VALUES: null + IN_CHANS: 3 + MLP_RATIO: 4 + NUM_HEADS: 12 + PATCH_SIZE: 16 + QKV_BIAS: true + USE_APE: true + USE_MEAN_POOLING: false + USE_RPB: false + USE_SHARED_RPB: true +OUTPUT: 
/root/autodl-tmp/LSQ-simmim/checkpoint/simmim_pretrain/vit_simmim +PRETRAINED: '' +PRINT_FREQ: 500 +SAVE_FREQ: 20 +SEED: 0 +TAG: vit_simmim +TEST: + CROP: true +THROUGHPUT_MODE: false +TRAIN: + ACCUMULATION_STEPS: 0 + AUTO_RESUME: true + BASE_LR: 0.0002 + CLIP_GRAD: 5.0 + EPOCHS: 200 + LAYER_DECAY: 1.0 + LR_SCHEDULER: + DECAY_EPOCHS: 30 + DECAY_RATE: 0.1 + GAMMA: 0.1 + MULTISTEPS: + - 700 + NAME: multistep + MIN_LR: 5.0e-06 + OPTIMIZER: + BETAS: + - 0.9 + - 0.999 + EPS: 1.0e-08 + MOMENTUM: 0.9 + NAME: adamw + START_EPOCH: 0 + USE_CHECKPOINT: false + WARMUP_EPOCHS: 20 + WARMUP_LR: 1.0e-06 + WEIGHT_DECAY: 0.05 + +[2023-10-10 03:03:53 simmim_pretrain](data_simmim.py 96): INFO Pre-train data transform: + +[2023-10-10 03:04:01 simmim_pretrain](data_simmim.py 99): INFO Build dataset: train images = 3475344 +[2023-10-10 03:04:01 simmim_pretrain](main_simmim.py 103): INFO Creating model:vit/simmim_pretrain +Ignored all config about ViT! +Ignored all config about ViT! +Ignored all config about ViT! +Ignored all config about ViT! 
+loaded from pretrained weight +loaded from pretrained weight +loaded from pretrained weight +loaded from pretrained weight +[2023-10-10 03:04:02 simmim_pretrain](main_simmim.py 106): INFO SimMIM( + (encoder): VisionTransformerForSimMIM( + (patch_embed): PatchEmbed( + (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16)) + ) + (pos_drop): Dropout(p=0.0, inplace=False) + (norm_pre): Identity() + (blocks): Sequential( + (0): Block( + (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) + (attn): Attention( + (qkv): Linear(in_features=768, out_features=2304, bias=True) + (attn_drop): Dropout(p=0.0, inplace=False) + (proj): Linear(in_features=768, out_features=768, bias=True) + (proj_drop): Dropout(p=0.0, inplace=False) + ) + (drop_path): Identity() + (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) + (mlp): FFN( + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (act): GELU(approximate='none') + (fc2): Linear(in_features=3072, out_features=768, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): Block( + (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) + (attn): Attention( + (qkv): Linear(in_features=768, out_features=2304, bias=True) + (attn_drop): Dropout(p=0.0, inplace=False) + (proj): Linear(in_features=768, out_features=768, bias=True) + (proj_drop): Dropout(p=0.0, inplace=False) + ) + (drop_path): Identity() + (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) + (mlp): FFN( + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (act): GELU(approximate='none') + (fc2): Linear(in_features=3072, out_features=768, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): Block( + (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) + (attn): Attention( + (qkv): Linear(in_features=768, out_features=2304, bias=True) + (attn_drop): Dropout(p=0.0, inplace=False) + (proj): Linear(in_features=768, out_features=768, bias=True) + (proj_drop): Dropout(p=0.0, 
inplace=False) + ) + (drop_path): Identity() + (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) + (mlp): FFN( + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (act): GELU(approximate='none') + (fc2): Linear(in_features=3072, out_features=768, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): Block( + (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) + (attn): Attention( + (qkv): Linear(in_features=768, out_features=2304, bias=True) + (attn_drop): Dropout(p=0.0, inplace=False) + (proj): Linear(in_features=768, out_features=768, bias=True) + (proj_drop): Dropout(p=0.0, inplace=False) + ) + (drop_path): Identity() + (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) + (mlp): FFN( + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (act): GELU(approximate='none') + (fc2): Linear(in_features=3072, out_features=768, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): Block( + (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) + (attn): Attention( + (qkv): Linear(in_features=768, out_features=2304, bias=True) + (attn_drop): Dropout(p=0.0, inplace=False) + (proj): Linear(in_features=768, out_features=768, bias=True) + (proj_drop): Dropout(p=0.0, inplace=False) + ) + (drop_path): Identity() + (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) + (mlp): FFN( + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (act): GELU(approximate='none') + (fc2): Linear(in_features=3072, out_features=768, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): Block( + (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) + (attn): Attention( + (qkv): Linear(in_features=768, out_features=2304, bias=True) + (attn_drop): Dropout(p=0.0, inplace=False) + (proj): Linear(in_features=768, out_features=768, bias=True) + (proj_drop): Dropout(p=0.0, inplace=False) + ) + (drop_path): Identity() + (norm2): LayerNorm((768,), eps=1e-06, 
elementwise_affine=True) + (mlp): FFN( + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (act): GELU(approximate='none') + (fc2): Linear(in_features=3072, out_features=768, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (6): Block( + (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) + (attn): Attention( + (qkv): Linear(in_features=768, out_features=2304, bias=True) + (attn_drop): Dropout(p=0.0, inplace=False) + (proj): Linear(in_features=768, out_features=768, bias=True) + (proj_drop): Dropout(p=0.0, inplace=False) + ) + (drop_path): Identity() + (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) + (mlp): FFN( + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (act): GELU(approximate='none') + (fc2): Linear(in_features=3072, out_features=768, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (7): Block( + (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) + (attn): Attention( + (qkv): Linear(in_features=768, out_features=2304, bias=True) + (attn_drop): Dropout(p=0.0, inplace=False) + (proj): Linear(in_features=768, out_features=768, bias=True) + (proj_drop): Dropout(p=0.0, inplace=False) + ) + (drop_path): Identity() + (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) + (mlp): FFN( + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (act): GELU(approximate='none') + (fc2): Linear(in_features=3072, out_features=768, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (8): Block( + (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) + (attn): Attention( + (qkv): Linear(in_features=768, out_features=2304, bias=True) + (attn_drop): Dropout(p=0.0, inplace=False) + (proj): Linear(in_features=768, out_features=768, bias=True) + (proj_drop): Dropout(p=0.0, inplace=False) + ) + (drop_path): Identity() + (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) + (mlp): FFN( + (fc1): Linear(in_features=768, out_features=3072, 
bias=True) + (act): GELU(approximate='none') + (fc2): Linear(in_features=3072, out_features=768, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (9): Block( + (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) + (attn): Attention( + (qkv): Linear(in_features=768, out_features=2304, bias=True) + (attn_drop): Dropout(p=0.0, inplace=False) + (proj): Linear(in_features=768, out_features=768, bias=True) + (proj_drop): Dropout(p=0.0, inplace=False) + ) + (drop_path): Identity() + (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) + (mlp): FFN( + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (act): GELU(approximate='none') + (fc2): Linear(in_features=3072, out_features=768, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (10): Block( + (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) + (attn): Attention( + (qkv): Linear(in_features=768, out_features=2304, bias=True) + (attn_drop): Dropout(p=0.0, inplace=False) + (proj): Linear(in_features=768, out_features=768, bias=True) + (proj_drop): Dropout(p=0.0, inplace=False) + ) + (drop_path): Identity() + (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) + (mlp): FFN( + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (act): GELU(approximate='none') + (fc2): Linear(in_features=3072, out_features=768, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (11): Block( + (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) + (attn): Attention( + (qkv): Linear(in_features=768, out_features=2304, bias=True) + (attn_drop): Dropout(p=0.0, inplace=False) + (proj): Linear(in_features=768, out_features=768, bias=True) + (proj_drop): Dropout(p=0.0, inplace=False) + ) + (drop_path): Identity() + (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) + (mlp): FFN( + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (act): GELU(approximate='none') + (fc2): Linear(in_features=3072, out_features=768, 
bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + (norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True) + ) + (decoder): Sequential( + (0): Conv2d(768, 768, kernel_size=(1, 1), stride=(1, 1)) + (1): PixelShuffle(upscale_factor=16) + ) +) +[2023-10-10 03:04:02 simmim_pretrain](optimizer.py 22): INFO >>>>>>>>>> Build Optimizer for Pre-training Stage +[2023-10-10 03:04:02 simmim_pretrain](optimizer.py 27): INFO No weight decay: {} +[2023-10-10 03:04:02 simmim_pretrain](optimizer.py 30): INFO No weight decay keywords: {} +[2023-10-10 03:04:02 simmim_pretrain](optimizer.py 63): INFO No decay params: ['encoder.patch_embed.proj.bias', 'encoder.blocks.0.norm1.weight', 'encoder.blocks.0.norm1.bias', 'encoder.blocks.0.attn.qkv.bias', 'encoder.blocks.0.attn.proj.bias', 'encoder.blocks.0.norm2.weight', 'encoder.blocks.0.norm2.bias', 'encoder.blocks.0.mlp.fc1.bias', 'encoder.blocks.0.mlp.fc2.bias', 'encoder.blocks.1.norm1.weight', 'encoder.blocks.1.norm1.bias', 'encoder.blocks.1.attn.qkv.bias', 'encoder.blocks.1.attn.proj.bias', 'encoder.blocks.1.norm2.weight', 'encoder.blocks.1.norm2.bias', 'encoder.blocks.1.mlp.fc1.bias', 'encoder.blocks.1.mlp.fc2.bias', 'encoder.blocks.2.norm1.weight', 'encoder.blocks.2.norm1.bias', 'encoder.blocks.2.attn.qkv.bias', 'encoder.blocks.2.attn.proj.bias', 'encoder.blocks.2.norm2.weight', 'encoder.blocks.2.norm2.bias', 'encoder.blocks.2.mlp.fc1.bias', 'encoder.blocks.2.mlp.fc2.bias', 'encoder.blocks.3.norm1.weight', 'encoder.blocks.3.norm1.bias', 'encoder.blocks.3.attn.qkv.bias', 'encoder.blocks.3.attn.proj.bias', 'encoder.blocks.3.norm2.weight', 'encoder.blocks.3.norm2.bias', 'encoder.blocks.3.mlp.fc1.bias', 'encoder.blocks.3.mlp.fc2.bias', 'encoder.blocks.4.norm1.weight', 'encoder.blocks.4.norm1.bias', 'encoder.blocks.4.attn.qkv.bias', 'encoder.blocks.4.attn.proj.bias', 'encoder.blocks.4.norm2.weight', 'encoder.blocks.4.norm2.bias', 'encoder.blocks.4.mlp.fc1.bias', 'encoder.blocks.4.mlp.fc2.bias', 
'encoder.blocks.5.norm1.weight', 'encoder.blocks.5.norm1.bias', 'encoder.blocks.5.attn.qkv.bias', 'encoder.blocks.5.attn.proj.bias', 'encoder.blocks.5.norm2.weight', 'encoder.blocks.5.norm2.bias', 'encoder.blocks.5.mlp.fc1.bias', 'encoder.blocks.5.mlp.fc2.bias', 'encoder.blocks.6.norm1.weight', 'encoder.blocks.6.norm1.bias', 'encoder.blocks.6.attn.qkv.bias', 'encoder.blocks.6.attn.proj.bias', 'encoder.blocks.6.norm2.weight', 'encoder.blocks.6.norm2.bias', 'encoder.blocks.6.mlp.fc1.bias', 'encoder.blocks.6.mlp.fc2.bias', 'encoder.blocks.7.norm1.weight', 'encoder.blocks.7.norm1.bias', 'encoder.blocks.7.attn.qkv.bias', 'encoder.blocks.7.attn.proj.bias', 'encoder.blocks.7.norm2.weight', 'encoder.blocks.7.norm2.bias', 'encoder.blocks.7.mlp.fc1.bias', 'encoder.blocks.7.mlp.fc2.bias', 'encoder.blocks.8.norm1.weight', 'encoder.blocks.8.norm1.bias', 'encoder.blocks.8.attn.qkv.bias', 'encoder.blocks.8.attn.proj.bias', 'encoder.blocks.8.norm2.weight', 'encoder.blocks.8.norm2.bias', 'encoder.blocks.8.mlp.fc1.bias', 'encoder.blocks.8.mlp.fc2.bias', 'encoder.blocks.9.norm1.weight', 'encoder.blocks.9.norm1.bias', 'encoder.blocks.9.attn.qkv.bias', 'encoder.blocks.9.attn.proj.bias', 'encoder.blocks.9.norm2.weight', 'encoder.blocks.9.norm2.bias', 'encoder.blocks.9.mlp.fc1.bias', 'encoder.blocks.9.mlp.fc2.bias', 'encoder.blocks.10.norm1.weight', 'encoder.blocks.10.norm1.bias', 'encoder.blocks.10.attn.qkv.bias', 'encoder.blocks.10.attn.proj.bias', 'encoder.blocks.10.norm2.weight', 'encoder.blocks.10.norm2.bias', 'encoder.blocks.10.mlp.fc1.bias', 'encoder.blocks.10.mlp.fc2.bias', 'encoder.blocks.11.norm1.weight', 'encoder.blocks.11.norm1.bias', 'encoder.blocks.11.attn.qkv.bias', 'encoder.blocks.11.attn.proj.bias', 'encoder.blocks.11.norm2.weight', 'encoder.blocks.11.norm2.bias', 'encoder.blocks.11.mlp.fc1.bias', 'encoder.blocks.11.mlp.fc2.bias', 'encoder.norm.weight', 'encoder.norm.bias', 'decoder.0.bias'] +[2023-10-10 03:04:02 simmim_pretrain](optimizer.py 64): INFO Has decay params: 
['encoder.cls_token', 'encoder.pos_embed', 'encoder.mask_token', 'encoder.patch_embed.proj.weight', 'encoder.blocks.0.attn.qkv.weight', 'encoder.blocks.0.attn.proj.weight', 'encoder.blocks.0.mlp.fc1.weight', 'encoder.blocks.0.mlp.fc2.weight', 'encoder.blocks.1.attn.qkv.weight', 'encoder.blocks.1.attn.proj.weight', 'encoder.blocks.1.mlp.fc1.weight', 'encoder.blocks.1.mlp.fc2.weight', 'encoder.blocks.2.attn.qkv.weight', 'encoder.blocks.2.attn.proj.weight', 'encoder.blocks.2.mlp.fc1.weight', 'encoder.blocks.2.mlp.fc2.weight', 'encoder.blocks.3.attn.qkv.weight', 'encoder.blocks.3.attn.proj.weight', 'encoder.blocks.3.mlp.fc1.weight', 'encoder.blocks.3.mlp.fc2.weight', 'encoder.blocks.4.attn.qkv.weight', 'encoder.blocks.4.attn.proj.weight', 'encoder.blocks.4.mlp.fc1.weight', 'encoder.blocks.4.mlp.fc2.weight', 'encoder.blocks.5.attn.qkv.weight', 'encoder.blocks.5.attn.proj.weight', 'encoder.blocks.5.mlp.fc1.weight', 'encoder.blocks.5.mlp.fc2.weight', 'encoder.blocks.6.attn.qkv.weight', 'encoder.blocks.6.attn.proj.weight', 'encoder.blocks.6.mlp.fc1.weight', 'encoder.blocks.6.mlp.fc2.weight', 'encoder.blocks.7.attn.qkv.weight', 'encoder.blocks.7.attn.proj.weight', 'encoder.blocks.7.mlp.fc1.weight', 'encoder.blocks.7.mlp.fc2.weight', 'encoder.blocks.8.attn.qkv.weight', 'encoder.blocks.8.attn.proj.weight', 'encoder.blocks.8.mlp.fc1.weight', 'encoder.blocks.8.mlp.fc2.weight', 'encoder.blocks.9.attn.qkv.weight', 'encoder.blocks.9.attn.proj.weight', 'encoder.blocks.9.mlp.fc1.weight', 'encoder.blocks.9.mlp.fc2.weight', 'encoder.blocks.10.attn.qkv.weight', 'encoder.blocks.10.attn.proj.weight', 'encoder.blocks.10.mlp.fc1.weight', 'encoder.blocks.10.mlp.fc2.weight', 'encoder.blocks.11.attn.qkv.weight', 'encoder.blocks.11.attn.proj.weight', 'encoder.blocks.11.mlp.fc1.weight', 'encoder.blocks.11.mlp.fc2.weight', 'decoder.0.weight'] +[2023-10-10 03:04:02 simmim_pretrain](optimizer.py 43): INFO AdamW ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + capturable: False + 
differentiable: False + eps: 1e-08 + foreach: None + fused: None + lr: 0.0002 + maximize: False + weight_decay: 0.05 + +Parameter Group 1 + amsgrad: False + betas: (0.9, 0.999) + capturable: False + differentiable: False + eps: 1e-08 + foreach: None + fused: None + lr: 0.0002 + maximize: False + weight_decay: 0.0 +) +[2023-10-10 03:04:02 simmim_pretrain](main_simmim.py 120): INFO number of params: 86390016 +[2023-10-10 03:04:02 simmim_pretrain](utils.py 83): INFO All checkpoints founded in /root/autodl-tmp/LSQ-simmim/checkpoint/simmim_pretrain/vit_simmim: [] +[2023-10-10 03:04:02 simmim_pretrain](main_simmim.py 137): INFO no checkpoint found in /root/autodl-tmp/LSQ-simmim/checkpoint/simmim_pretrain/vit_simmim, ignoring auto resume +[2023-10-10 03:04:02 simmim_pretrain](main_simmim.py 145): INFO Start training +/root/miniconda3/lib/python3.8/site-packages/torch/autograd/__init__.py:200: UserWarning: Grad strides do not match bucket view strides. This may indicate grad was not created according to the gradient layout contract, or that the param's strides changed since DDP was constructed. This is not an error, but may impair performance. +grad.sizes() = [768, 768, 1, 1], strides() = [768, 1, 768, 768] +bucket_view.sizes() = [768, 768, 1, 1], strides() = [768, 1, 1, 1] (Triggered internally at ../torch/csrc/distributed/c10d/reducer.cpp:323.) + Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +/root/miniconda3/lib/python3.8/site-packages/torch/autograd/__init__.py:200: UserWarning: Grad strides do not match bucket view strides. This may indicate grad was not created according to the gradient layout contract, or that the param's strides changed since DDP was constructed. This is not an error, but may impair performance. +grad.sizes() = [768, 768, 1, 1], strides() = [768, 1, 768, 768] +bucket_view.sizes() = [768, 768, 1, 1], strides() = [768, 1, 1, 1] (Triggered internally at ../torch/csrc/distributed/c10d/reducer.cpp:323.) 
+ Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +/root/miniconda3/lib/python3.8/site-packages/torch/autograd/__init__.py:200: UserWarning: Grad strides do not match bucket view strides. This may indicate grad was not created according to the gradient layout contract, or that the param's strides changed since DDP was constructed. This is not an error, but may impair performance. +grad.sizes() = [768, 768, 1, 1], strides() = [768, 1, 768, 768] +bucket_view.sizes() = [768, 768, 1, 1], strides() = [768, 1, 1, 1] (Triggered internally at ../torch/csrc/distributed/c10d/reducer.cpp:323.) + Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +/root/miniconda3/lib/python3.8/site-packages/torch/autograd/__init__.py:200: UserWarning: Grad strides do not match bucket view strides. This may indicate grad was not created according to the gradient layout contract, or that the param's strides changed since DDP was constructed. This is not an error, but may impair performance. +grad.sizes() = [768, 768, 1, 1], strides() = [768, 1, 768, 768] +bucket_view.sizes() = [768, 768, 1, 1], strides() = [768, 1, 1, 1] (Triggered internally at ../torch/csrc/distributed/c10d/reducer.cpp:323.) 
+ Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +[2023-10-10 03:04:07 simmim_pretrain](main_simmim.py 218): INFO Train: [0/200][0/6787] eta 8:30:41 lr 0.000001 time 4.5148 (4.5148) loss 1.7152 (1.7152) grad_norm 553505.6250 (553505.6250) mem 13533MB +[2023-10-10 03:06:09 simmim_pretrain](main_simmim.py 218): INFO Train: [0/200][500/6787] eta 0:26:26 lr 0.000002 time 0.2444 (0.2523) loss 0.6251 (0.8950) grad_norm 25038.8047 (100184.2969) mem 14543MB +[2023-10-10 03:08:12 simmim_pretrain](main_simmim.py 218): INFO Train: [0/200][1000/6787] eta 0:24:04 lr 0.000002 time 0.2562 (0.2497) loss 0.5471 (0.7294) grad_norm 23971.2754 (64596.7500) mem 14543MB +[2023-10-10 03:10:17 simmim_pretrain](main_simmim.py 218): INFO Train: [0/200][1500/6787] eta 0:22:00 lr 0.000003 time 0.2493 (0.2498) loss 0.5397 (0.6632) grad_norm 59519.0664 (58565.0820) mem 14543MB +[2023-10-10 03:12:22 simmim_pretrain](main_simmim.py 218): INFO Train: [0/200][2000/6787] eta 0:19:56 lr 0.000004 time 0.2587 (0.2500) loss 0.5102 (0.6263) grad_norm 240317.0781 (71993.7969) mem 14543MB +[2023-10-10 03:14:28 simmim_pretrain](main_simmim.py 218): INFO Train: [0/200][2500/6787] eta 0:17:52 lr 0.000005 time 0.2461 (0.2501) loss 0.4838 (0.6021) grad_norm 154843.9688 (98793.8047) mem 14543MB +[2023-10-10 03:16:33 simmim_pretrain](main_simmim.py 218): INFO Train: [0/200][3000/6787] eta 0:15:47 lr 0.000005 time 0.2448 (0.2501) loss 0.5123 (0.5849) grad_norm 541804.3750 (134192.4844) mem 14543MB +[2023-10-10 03:18:38 simmim_pretrain](main_simmim.py 218): INFO Train: [0/200][3500/6787] eta 0:13:42 lr 0.000006 time 0.2503 (0.2502) loss 0.5012 (0.5723) grad_norm 501434.0938 (172352.0156) mem 14543MB +[2023-10-10 03:20:44 simmim_pretrain](main_simmim.py 218): INFO Train: [0/200][4000/6787] eta 0:11:37 lr 0.000007 time 0.2492 (0.2503) loss 0.4889 (0.5626) grad_norm 609606.8750 (195628.6562) mem 14543MB +[2023-10-10 03:22:50 simmim_pretrain](main_simmim.py 218): INFO 
Train: [0/200][4500/6787] eta 0:09:32 lr 0.000008 time 0.2598 (0.2504) loss 0.4657 (0.5546) grad_norm 821053.7500 (206329.3281) mem 14543MB +[2023-10-10 03:24:59 simmim_pretrain](main_simmim.py 218): INFO Train: [0/200][5000/6787] eta 0:07:29 lr 0.000008 time 0.2599 (0.2513) loss 0.4856 (0.5479) grad_norm 359824.9688 (230155.2344) mem 14543MB +[2023-10-10 03:27:05 simmim_pretrain](main_simmim.py 218): INFO Train: [0/200][5500/6787] eta 0:05:23 lr 0.000009 time 0.2514 (0.2514) loss 0.4633 (0.5422) grad_norm 816823.6875 (253836.1719) mem 14543MB +[2023-10-10 03:29:10 simmim_pretrain](main_simmim.py 218): INFO Train: [0/200][6000/6787] eta 0:03:17 lr 0.000010 time 0.2589 (0.2513) loss 0.4895 (0.5374) grad_norm 809127.5625 (266184.0938) mem 14543MB +[2023-10-10 03:31:16 simmim_pretrain](main_simmim.py 218): INFO Train: [0/200][6500/6787] eta 0:01:12 lr 0.000011 time 0.2519 (0.2513) loss 0.5012 (0.5330) grad_norm 980117.6875 (inf) mem 14543MB +[2023-10-10 03:32:28 simmim_pretrain](main_simmim.py 228): INFO EPOCH 0 training takes 0:28:26 +[2023-10-10 03:32:28 simmim_pretrain](utils.py 62): INFO /root/autodl-tmp/LSQ-simmim/checkpoint/simmim_pretrain/vit_simmim/ckpt_epoch_0.pth saving...... +[2023-10-10 03:32:29 simmim_pretrain](utils.py 64): INFO /root/autodl-tmp/LSQ-simmim/checkpoint/simmim_pretrain/vit_simmim/ckpt_epoch_0.pth saved !!! 
+[2023-10-10 03:32:30 simmim_pretrain](main_simmim.py 218): INFO Train: [1/200][0/6787] eta 2:32:49 lr 0.000011 time 1.3511 (1.3511) loss 0.4772 (0.4772) grad_norm 144767.9062 (144767.9062) mem 14543MB +[2023-10-10 03:34:36 simmim_pretrain](main_simmim.py 218): INFO Train: [1/200][500/6787] eta 0:26:27 lr 0.000012 time 0.2545 (0.2525) loss 0.4586 (0.4690) grad_norm 535874.0625 (665340.0625) mem 14543MB +[2023-10-10 03:36:41 simmim_pretrain](main_simmim.py 218): INFO Train: [1/200][1000/6787] eta 0:24:17 lr 0.000012 time 0.2483 (0.2518) loss 0.4475 (0.4653) grad_norm 960475.7500 (644721.1875) mem 14543MB +[2023-10-10 03:38:46 simmim_pretrain](main_simmim.py 218): INFO Train: [1/200][1500/6787] eta 0:22:09 lr 0.000013 time 0.2486 (0.2514) loss 0.4601 (0.4622) grad_norm 539256.4375 (648973.9375) mem 14543MB +[2023-10-10 03:40:52 simmim_pretrain](main_simmim.py 218): INFO Train: [1/200][2000/6787] eta 0:20:02 lr 0.000014 time 0.2541 (0.2511) loss 0.4288 (0.4594) grad_norm 710371.8750 (712513.4375) mem 14543MB +[2023-10-10 03:42:57 simmim_pretrain](main_simmim.py 218): INFO Train: [1/200][2500/6787] eta 0:17:56 lr 0.000015 time 0.2554 (0.2510) loss 0.4711 (0.4572) grad_norm 764299.3125 (719295.8750) mem 14543MB +[2023-10-10 03:45:02 simmim_pretrain](main_simmim.py 218): INFO Train: [1/200][3000/6787] eta 0:15:50 lr 0.000015 time 0.2468 (0.2509) loss 0.4599 (0.4552) grad_norm 511418.3125 (765126.8125) mem 14543MB +[2023-10-10 03:47:07 simmim_pretrain](main_simmim.py 218): INFO Train: [1/200][3500/6787] eta 0:13:44 lr 0.000016 time 0.2505 (0.2509) loss 0.4501 (0.4535) grad_norm 1814348.5000 (inf) mem 14543MB +[2023-10-10 03:49:13 simmim_pretrain](main_simmim.py 218): INFO Train: [1/200][4000/6787] eta 0:11:39 lr 0.000017 time 0.2503 (0.2508) loss 0.4399 (0.4522) grad_norm 1776759.3750 (inf) mem 14543MB +[2023-10-10 03:51:18 simmim_pretrain](main_simmim.py 218): INFO Train: [1/200][4500/6787] eta 0:09:33 lr 0.000018 time 0.2514 (0.2507) loss 0.4322 (0.4509) grad_norm 
922950.3125 (inf) mem 14543MB +[2023-10-10 03:53:23 simmim_pretrain](main_simmim.py 218): INFO Train: [1/200][5000/6787] eta 0:07:27 lr 0.000018 time 0.2493 (0.2507) loss 0.4397 (0.4497) grad_norm 1040136.3750 (inf) mem 14543MB +[2023-10-10 03:55:28 simmim_pretrain](main_simmim.py 218): INFO Train: [1/200][5500/6787] eta 0:05:22 lr 0.000019 time 0.2485 (0.2507) loss 0.4340 (0.4487) grad_norm 647195.6250 (inf) mem 14543MB +[2023-10-10 03:57:33 simmim_pretrain](main_simmim.py 218): INFO Train: [1/200][6000/6787] eta 0:03:17 lr 0.000020 time 0.2502 (0.2507) loss 0.4386 (0.4477) grad_norm 1406970.0000 (inf) mem 14543MB +[2023-10-10 03:59:39 simmim_pretrain](main_simmim.py 218): INFO Train: [1/200][6500/6787] eta 0:01:11 lr 0.000020 time 0.2488 (0.2507) loss 0.4304 (0.4467) grad_norm 487872.5938 (inf) mem 14543MB +[2023-10-10 04:00:51 simmim_pretrain](main_simmim.py 228): INFO EPOCH 1 training takes 0:28:22 +[2023-10-10 04:00:53 simmim_pretrain](main_simmim.py 218): INFO Train: [2/200][0/6787] eta 2:35:04 lr 0.000021 time 1.3709 (1.3709) loss 0.4393 (0.4393) grad_norm 659639.8125 (659639.8125) mem 14543MB +[2023-10-10 04:02:57 simmim_pretrain](main_simmim.py 218): INFO Train: [2/200][500/6787] eta 0:26:23 lr 0.000022 time 0.2462 (0.2518) loss 0.4335 (0.4324) grad_norm 872738.8125 (inf) mem 14543MB +[2023-10-10 04:05:03 simmim_pretrain](main_simmim.py 218): INFO Train: [2/200][1000/6787] eta 0:24:13 lr 0.000022 time 0.2536 (0.2512) loss 0.4051 (0.4329) grad_norm 1444892.6250 (inf) mem 14543MB +[2023-10-10 04:07:08 simmim_pretrain](main_simmim.py 218): INFO Train: [2/200][1500/6787] eta 0:22:06 lr 0.000023 time 0.2511 (0.2510) loss 0.4447 (0.4324) grad_norm 728204.2500 (inf) mem 14543MB +[2023-10-10 04:09:14 simmim_pretrain](main_simmim.py 218): INFO Train: [2/200][2000/6787] eta 0:20:01 lr 0.000024 time 0.2473 (0.2511) loss 0.4198 (0.4320) grad_norm 922745.8750 (inf) mem 14543MB +[2023-10-10 04:11:19 simmim_pretrain](main_simmim.py 218): INFO Train: [2/200][2500/6787] 
eta 0:17:56 lr 0.000025 time 0.2559 (0.2512) loss 0.4247 (0.4317) grad_norm 1370773.6250 (inf) mem 14543MB +[2023-10-10 04:13:25 simmim_pretrain](main_simmim.py 218): INFO Train: [2/200][3000/6787] eta 0:15:51 lr 0.000025 time 0.2496 (0.2513) loss 0.3997 (0.4312) grad_norm 585347.3750 (inf) mem 14543MB +[2023-10-10 04:15:31 simmim_pretrain](main_simmim.py 218): INFO Train: [2/200][3500/6787] eta 0:13:46 lr 0.000026 time 0.2492 (0.2514) loss 0.4111 (0.4308) grad_norm 592507.3125 (inf) mem 14543MB +[2023-10-10 04:17:38 simmim_pretrain](main_simmim.py 218): INFO Train: [2/200][4000/6787] eta 0:11:40 lr 0.000027 time 0.2577 (0.2515) loss 0.4029 (0.4304) grad_norm 939298.0625 (inf) mem 14543MB +[2023-10-10 04:19:44 simmim_pretrain](main_simmim.py 218): INFO Train: [2/200][4500/6787] eta 0:09:35 lr 0.000027 time 0.2520 (0.2516) loss 0.4360 (0.4301) grad_norm 746308.8750 (inf) mem 14543MB +[2023-10-10 04:21:50 simmim_pretrain](main_simmim.py 218): INFO Train: [2/200][5000/6787] eta 0:07:29 lr 0.000028 time 0.2493 (0.2516) loss 0.4662 (0.4298) grad_norm 573671.5625 (inf) mem 14543MB +[2023-10-10 04:23:55 simmim_pretrain](main_simmim.py 218): INFO Train: [2/200][5500/6787] eta 0:05:23 lr 0.000029 time 0.2523 (0.2516) loss 0.4098 (0.4294) grad_norm 2750094.2500 (inf) mem 14543MB +[2023-10-10 04:26:01 simmim_pretrain](main_simmim.py 218): INFO Train: [2/200][6000/6787] eta 0:03:17 lr 0.000030 time 0.2487 (0.2516) loss 0.4324 (0.4292) grad_norm 1130291.7500 (inf) mem 14543MB +[2023-10-10 04:28:07 simmim_pretrain](main_simmim.py 218): INFO Train: [2/200][6500/6787] eta 0:01:12 lr 0.000030 time 0.2520 (0.2515) loss 0.4199 (0.4290) grad_norm 529819.1875 (inf) mem 14543MB +[2023-10-10 04:29:19 simmim_pretrain](main_simmim.py 228): INFO EPOCH 2 training takes 0:28:27 +[2023-10-10 04:29:21 simmim_pretrain](main_simmim.py 218): INFO Train: [3/200][0/6787] eta 2:32:21 lr 0.000031 time 1.3469 (1.3469) loss 0.4341 (0.4341) grad_norm 794711.5000 (794711.5000) mem 14543MB +[2023-10-10 
04:31:26 simmim_pretrain](main_simmim.py 218): INFO Train: [3/200][500/6787] eta 0:26:31 lr 0.000032 time 0.2562 (0.2531) loss 0.4010 (0.4247) grad_norm 601396.8750 (687885.3125) mem 14543MB +[2023-10-10 04:33:32 simmim_pretrain](main_simmim.py 218): INFO Train: [3/200][1000/6787] eta 0:24:19 lr 0.000032 time 0.2506 (0.2522) loss 0.4401 (0.4241) grad_norm 709212.1250 (711774.1250) mem 14543MB +[2023-10-10 04:35:37 simmim_pretrain](main_simmim.py 218): INFO Train: [3/200][1500/6787] eta 0:22:10 lr 0.000033 time 0.2517 (0.2517) loss 0.4268 (0.4241) grad_norm 403870.3125 (764413.6875) mem 14543MB +[2023-10-10 04:37:42 simmim_pretrain](main_simmim.py 218): INFO Train: [3/200][2000/6787] eta 0:20:03 lr 0.000034 time 0.2501 (0.2515) loss 0.4287 (0.4238) grad_norm 1236570.1250 (inf) mem 14543MB +[2023-10-10 04:39:48 simmim_pretrain](main_simmim.py 218): INFO Train: [3/200][2500/6787] eta 0:17:57 lr 0.000035 time 0.2558 (0.2514) loss 0.4235 (0.4238) grad_norm 465070.2188 (inf) mem 14543MB +[2023-10-10 04:41:53 simmim_pretrain](main_simmim.py 218): INFO Train: [3/200][3000/6787] eta 0:15:51 lr 0.000035 time 0.2520 (0.2513) loss 0.4204 (0.4235) grad_norm 961153.3125 (inf) mem 14543MB +[2023-10-10 04:43:59 simmim_pretrain](main_simmim.py 218): INFO Train: [3/200][3500/6787] eta 0:13:45 lr 0.000036 time 0.2535 (0.2512) loss 0.4419 (0.4232) grad_norm 600768.2500 (inf) mem 14543MB +[2023-10-10 04:46:04 simmim_pretrain](main_simmim.py 218): INFO Train: [3/200][4000/6787] eta 0:11:39 lr 0.000037 time 0.2496 (0.2512) loss 0.4316 (0.4232) grad_norm 530462.1250 (inf) mem 14543MB +[2023-10-10 04:48:09 simmim_pretrain](main_simmim.py 218): INFO Train: [3/200][4500/6787] eta 0:09:34 lr 0.000037 time 0.2522 (0.2511) loss 0.4503 (0.4227) grad_norm 399515.7500 (inf) mem 14543MB +[2023-10-10 04:50:15 simmim_pretrain](main_simmim.py 218): INFO Train: [3/200][5000/6787] eta 0:07:28 lr 0.000038 time 0.2537 (0.2510) loss 0.4279 (0.4223) grad_norm 513325.5000 (inf) mem 14543MB +[2023-10-10 
04:52:20 simmim_pretrain](main_simmim.py 218): INFO Train: [3/200][5500/6787] eta 0:05:23 lr 0.000039 time 0.2517 (0.2510) loss 0.4282 (0.4220) grad_norm 868282.9375 (inf) mem 14543MB +[2023-10-10 04:54:25 simmim_pretrain](main_simmim.py 218): INFO Train: [3/200][6000/6787] eta 0:03:17 lr 0.000040 time 0.2473 (0.2510) loss 0.4171 (0.4216) grad_norm 571394.0000 (inf) mem 14543MB +[2023-10-10 04:56:31 simmim_pretrain](main_simmim.py 218): INFO Train: [3/200][6500/6787] eta 0:01:12 lr 0.000040 time 0.2577 (0.2510) loss 0.4230 (0.4212) grad_norm 919010.5000 (inf) mem 14543MB +[2023-10-10 04:57:43 simmim_pretrain](main_simmim.py 228): INFO EPOCH 3 training takes 0:28:23 +[2023-10-10 04:57:44 simmim_pretrain](main_simmim.py 218): INFO Train: [4/200][0/6787] eta 2:34:16 lr 0.000041 time 1.3639 (1.3639) loss 0.4357 (0.4357) grad_norm 661257.6250 (661257.6250) mem 14543MB +[2023-10-10 04:59:49 simmim_pretrain](main_simmim.py 218): INFO Train: [4/200][500/6787] eta 0:26:24 lr 0.000042 time 0.2461 (0.2521) loss 0.4066 (0.4155) grad_norm 774128.5625 (671610.5000) mem 14543MB +[2023-10-10 05:01:54 simmim_pretrain](main_simmim.py 218): INFO Train: [4/200][1000/6787] eta 0:24:14 lr 0.000042 time 0.2503 (0.2513) loss 0.4002 (0.4164) grad_norm 274338.7500 (nan) mem 14543MB +[2023-10-10 05:04:00 simmim_pretrain](main_simmim.py 218): INFO Train: [4/200][1500/6787] eta 0:22:07 lr 0.000043 time 0.2459 (0.2511) loss 0.3930 (0.4164) grad_norm 233489.1562 (nan) mem 14543MB +[2023-10-10 05:06:05 simmim_pretrain](main_simmim.py 218): INFO Train: [4/200][2000/6787] eta 0:20:01 lr 0.000044 time 0.2518 (0.2511) loss 0.4203 (0.4162) grad_norm 322065.9688 (nan) mem 14543MB +[2023-10-10 05:08:11 simmim_pretrain](main_simmim.py 218): INFO Train: [4/200][2500/6787] eta 0:17:56 lr 0.000044 time 0.2514 (0.2511) loss 0.4085 (0.4157) grad_norm 197069.8906 (nan) mem 14543MB +[2023-10-10 05:10:16 simmim_pretrain](main_simmim.py 218): INFO Train: [4/200][3000/6787] eta 0:15:50 lr 0.000045 time 0.2458 
(0.2510) loss 0.4203 (0.4153) grad_norm 397812.2500 (nan) mem 14543MB +[2023-10-10 05:12:22 simmim_pretrain](main_simmim.py 218): INFO Train: [4/200][3500/6787] eta 0:13:45 lr 0.000046 time 0.2501 (0.2510) loss 0.3780 (0.4148) grad_norm 517283.5312 (nan) mem 14543MB +[2023-10-10 05:14:28 simmim_pretrain](main_simmim.py 218): INFO Train: [4/200][4000/6787] eta 0:11:39 lr 0.000047 time 0.2540 (0.2511) loss 0.4296 (0.4143) grad_norm 309018.5000 (nan) mem 14543MB +[2023-10-10 05:16:33 simmim_pretrain](main_simmim.py 218): INFO Train: [4/200][4500/6787] eta 0:09:34 lr 0.000047 time 0.2561 (0.2512) loss 0.4167 (0.4138) grad_norm 505430.5000 (nan) mem 14543MB +[2023-10-10 05:18:39 simmim_pretrain](main_simmim.py 218): INFO Train: [4/200][5000/6787] eta 0:07:28 lr 0.000048 time 0.2596 (0.2512) loss 0.4021 (0.4132) grad_norm 410150.2188 (nan) mem 14543MB +[2023-10-10 05:20:45 simmim_pretrain](main_simmim.py 218): INFO Train: [4/200][5500/6787] eta 0:05:23 lr 0.000049 time 0.2457 (0.2512) loss 0.3934 (0.4127) grad_norm 854229.9375 (nan) mem 14543MB +[2023-10-10 05:22:50 simmim_pretrain](main_simmim.py 218): INFO Train: [4/200][6000/6787] eta 0:03:17 lr 0.000050 time 0.2588 (0.2511) loss 0.4204 (0.4122) grad_norm 1532492.7500 (nan) mem 14543MB +[2023-10-10 05:24:55 simmim_pretrain](main_simmim.py 218): INFO Train: [4/200][6500/6787] eta 0:01:12 lr 0.000050 time 0.2514 (0.2511) loss 0.4155 (0.4118) grad_norm 491622.1562 (nan) mem 14543MB +[2023-10-10 05:26:08 simmim_pretrain](main_simmim.py 228): INFO EPOCH 4 training takes 0:28:24 +[2023-10-10 05:26:09 simmim_pretrain](main_simmim.py 218): INFO Train: [5/200][0/6787] eta 2:28:05 lr 0.000051 time 1.3092 (1.3092) loss 0.3984 (0.3984) grad_norm 578239.0625 (578239.0625) mem 14543MB +[2023-10-10 05:28:14 simmim_pretrain](main_simmim.py 218): INFO Train: [5/200][500/6787] eta 0:26:27 lr 0.000051 time 0.2496 (0.2525) loss 0.4156 (0.4047) grad_norm 390009.2500 (515957.2188) mem 14543MB +[2023-10-10 05:30:20 
simmim_pretrain](main_simmim.py 218): INFO Train: [5/200][1000/6787] eta 0:24:17 lr 0.000052 time 0.2547 (0.2519) loss 0.3872 (0.4049) grad_norm 904920.0000 (521818.1562) mem 14543MB +[2023-10-10 05:32:25 simmim_pretrain](main_simmim.py 218): INFO Train: [5/200][1500/6787] eta 0:22:07 lr 0.000053 time 0.2457 (0.2511) loss 0.3968 (0.4047) grad_norm 943757.4375 (557525.3125) mem 14543MB +[2023-10-10 05:34:30 simmim_pretrain](main_simmim.py 218): INFO Train: [5/200][2000/6787] eta 0:20:00 lr 0.000054 time 0.2535 (0.2508) loss 0.4031 (0.4044) grad_norm 814286.0625 (582936.7500) mem 14543MB +[2023-10-10 05:36:35 simmim_pretrain](main_simmim.py 218): INFO Train: [5/200][2500/6787] eta 0:17:54 lr 0.000054 time 0.2540 (0.2507) loss 0.4162 (0.4042) grad_norm 487653.4062 (605626.3750) mem 14543MB +[2023-10-10 05:38:40 simmim_pretrain](main_simmim.py 218): INFO Train: [5/200][3000/6787] eta 0:15:49 lr 0.000055 time 0.2470 (0.2506) loss 0.3849 (0.4038) grad_norm 422073.7188 (inf) mem 14543MB +[2023-10-10 05:40:45 simmim_pretrain](main_simmim.py 218): INFO Train: [5/200][3500/6787] eta 0:13:43 lr 0.000056 time 0.2520 (0.2506) loss 0.4068 (0.4036) grad_norm 520833.2500 (inf) mem 14543MB +[2023-10-10 05:42:50 simmim_pretrain](main_simmim.py 218): INFO Train: [5/200][4000/6787] eta 0:11:38 lr 0.000057 time 0.2447 (0.2506) loss 0.4007 (0.4034) grad_norm 401273.1250 (inf) mem 14543MB +[2023-10-10 05:44:55 simmim_pretrain](main_simmim.py 218): INFO Train: [5/200][4500/6787] eta 0:09:32 lr 0.000057 time 0.2544 (0.2505) loss 0.3900 (0.4033) grad_norm 527926.8125 (inf) mem 14543MB +[2023-10-10 05:47:01 simmim_pretrain](main_simmim.py 218): INFO Train: [5/200][5000/6787] eta 0:07:27 lr 0.000058 time 0.2523 (0.2505) loss 0.3939 (0.4032) grad_norm 721252.0000 (inf) mem 14543MB +[2023-10-10 05:49:06 simmim_pretrain](main_simmim.py 218): INFO Train: [5/200][5500/6787] eta 0:05:22 lr 0.000059 time 0.2524 (0.2505) loss 0.3907 (0.4030) grad_norm 570211.2500 (inf) mem 14543MB +[2023-10-10 
05:51:11 simmim_pretrain](main_simmim.py 218): INFO Train: [5/200][6000/6787] eta 0:03:17 lr 0.000060 time 0.2588 (0.2505) loss 0.4046 (0.4028) grad_norm 538420.2500 (inf) mem 14543MB +[2023-10-10 05:53:17 simmim_pretrain](main_simmim.py 218): INFO Train: [5/200][6500/6787] eta 0:01:11 lr 0.000060 time 0.2486 (0.2506) loss 0.4047 (0.4024) grad_norm 555881.3750 (inf) mem 14543MB +[2023-10-10 05:54:29 simmim_pretrain](main_simmim.py 228): INFO EPOCH 5 training takes 0:28:21 +[2023-10-10 05:54:31 simmim_pretrain](main_simmim.py 218): INFO Train: [6/200][0/6787] eta 2:38:55 lr 0.000061 time 1.4049 (1.4049) loss 0.3786 (0.3786) grad_norm 523171.9375 (523171.9375) mem 14543MB +[2023-10-10 05:56:36 simmim_pretrain](main_simmim.py 218): INFO Train: [6/200][500/6787] eta 0:26:28 lr 0.000061 time 0.2455 (0.2527) loss 0.4249 (0.3978) grad_norm 772955.5000 (582496.6875) mem 14543MB +[2023-10-10 05:58:41 simmim_pretrain](main_simmim.py 218): INFO Train: [6/200][1000/6787] eta 0:24:17 lr 0.000062 time 0.2521 (0.2519) loss 0.3953 (0.3980) grad_norm 493784.6875 (613746.7500) mem 14543MB +[2023-10-10 06:00:47 simmim_pretrain](main_simmim.py 218): INFO Train: [6/200][1500/6787] eta 0:22:10 lr 0.000063 time 0.2495 (0.2516) loss 0.3997 (0.3980) grad_norm 631632.8750 (inf) mem 14543MB +[2023-10-10 06:02:52 simmim_pretrain](main_simmim.py 218): INFO Train: [6/200][2000/6787] eta 0:20:03 lr 0.000064 time 0.2463 (0.2515) loss 0.4048 (0.3977) grad_norm 246547.5156 (inf) mem 14543MB +[2023-10-10 06:04:58 simmim_pretrain](main_simmim.py 218): INFO Train: [6/200][2500/6787] eta 0:17:57 lr 0.000064 time 0.2526 (0.2514) loss 0.3726 (0.3976) grad_norm 428185.0312 (inf) mem 14543MB +[2023-10-10 06:07:03 simmim_pretrain](main_simmim.py 218): INFO Train: [6/200][3000/6787] eta 0:15:51 lr 0.000065 time 0.2465 (0.2513) loss 0.3938 (0.3975) grad_norm 538215.9375 (inf) mem 14543MB +[2023-10-10 06:09:09 simmim_pretrain](main_simmim.py 218): INFO Train: [6/200][3500/6787] eta 0:13:45 lr 0.000066 time 
0.2511 (0.2512) loss 0.3874 (0.3972) grad_norm 542470.0000 (inf) mem 14543MB +[2023-10-10 06:11:14 simmim_pretrain](main_simmim.py 218): INFO Train: [6/200][4000/6787] eta 0:11:39 lr 0.000067 time 0.2470 (0.2511) loss 0.3968 (0.3970) grad_norm 662246.0000 (inf) mem 14543MB +[2023-10-10 06:13:20 simmim_pretrain](main_simmim.py 218): INFO Train: [6/200][4500/6787] eta 0:09:34 lr 0.000067 time 0.2465 (0.2511) loss 0.3794 (0.3967) grad_norm 518386.2812 (inf) mem 14543MB +[2023-10-10 06:15:25 simmim_pretrain](main_simmim.py 218): INFO Train: [6/200][5000/6787] eta 0:07:28 lr 0.000068 time 0.2447 (0.2511) loss 0.3752 (0.3966) grad_norm 489676.6250 (inf) mem 14543MB +[2023-10-10 06:17:30 simmim_pretrain](main_simmim.py 218): INFO Train: [6/200][5500/6787] eta 0:05:23 lr 0.000069 time 0.2504 (0.2510) loss 0.3967 (0.3964) grad_norm 667977.8125 (inf) mem 14543MB +[2023-10-10 06:19:35 simmim_pretrain](main_simmim.py 218): INFO Train: [6/200][6000/6787] eta 0:03:17 lr 0.000069 time 0.2541 (0.2510) loss 0.3923 (0.3961) grad_norm 547869.0000 (inf) mem 14543MB +[2023-10-10 06:21:41 simmim_pretrain](main_simmim.py 218): INFO Train: [6/200][6500/6787] eta 0:01:12 lr 0.000070 time 0.2496 (0.2510) loss 0.4107 (0.3959) grad_norm 792803.1250 (inf) mem 14543MB +[2023-10-10 06:22:53 simmim_pretrain](main_simmim.py 228): INFO EPOCH 6 training takes 0:28:24 +[2023-10-10 06:22:55 simmim_pretrain](main_simmim.py 218): INFO Train: [7/200][0/6787] eta 2:32:16 lr 0.000071 time 1.3461 (1.3461) loss 0.3952 (0.3952) grad_norm 719464.4375 (719464.4375) mem 14543MB +[2023-10-10 06:25:00 simmim_pretrain](main_simmim.py 218): INFO Train: [7/200][500/6787] eta 0:26:29 lr 0.000071 time 0.2512 (0.2528) loss 0.3729 (0.3929) grad_norm 400657.5000 (725651.1250) mem 14543MB +[2023-10-10 06:27:05 simmim_pretrain](main_simmim.py 218): INFO Train: [7/200][1000/6787] eta 0:24:16 lr 0.000072 time 0.2507 (0.2517) loss 0.4076 (0.3936) grad_norm 420490.4375 (inf) mem 14543MB +[2023-10-10 06:29:11 
simmim_pretrain](main_simmim.py 218): INFO Train: [7/200][1500/6787] eta 0:22:08 lr 0.000073 time 0.2477 (0.2513) loss 0.3962 (0.3933) grad_norm 409381.5312 (inf) mem 14543MB +[2023-10-10 06:31:16 simmim_pretrain](main_simmim.py 218): INFO Train: [7/200][2000/6787] eta 0:20:02 lr 0.000074 time 0.2521 (0.2511) loss 0.4027 (0.3934) grad_norm 650768.6875 (inf) mem 14543MB +[2023-10-10 06:33:21 simmim_pretrain](main_simmim.py 218): INFO Train: [7/200][2500/6787] eta 0:17:56 lr 0.000074 time 0.2519 (0.2511) loss 0.3823 (0.3932) grad_norm 423769.8125 (inf) mem 14543MB +[2023-10-10 06:35:27 simmim_pretrain](main_simmim.py 218): INFO Train: [7/200][3000/6787] eta 0:15:50 lr 0.000075 time 0.2556 (0.2511) loss 0.3847 (0.3928) grad_norm 785393.5000 (inf) mem 14543MB +[2023-10-10 06:37:32 simmim_pretrain](main_simmim.py 218): INFO Train: [7/200][3500/6787] eta 0:13:45 lr 0.000076 time 0.2523 (0.2511) loss 0.3747 (0.3927) grad_norm 567105.4375 (inf) mem 14543MB +[2023-10-10 06:39:38 simmim_pretrain](main_simmim.py 218): INFO Train: [7/200][4000/6787] eta 0:11:39 lr 0.000077 time 0.2543 (0.2510) loss 0.3926 (0.3927) grad_norm 563567.3125 (inf) mem 14543MB +[2023-10-10 06:41:43 simmim_pretrain](main_simmim.py 218): INFO Train: [7/200][4500/6787] eta 0:09:34 lr 0.000077 time 0.2515 (0.2510) loss 0.3847 (0.3925) grad_norm 639863.4375 (inf) mem 14543MB +[2023-10-10 06:43:49 simmim_pretrain](main_simmim.py 218): INFO Train: [7/200][5000/6787] eta 0:07:28 lr 0.000078 time 0.2466 (0.2510) loss 0.3835 (0.3924) grad_norm 266915.2812 (inf) mem 14543MB +[2023-10-10 06:45:54 simmim_pretrain](main_simmim.py 218): INFO Train: [7/200][5500/6787] eta 0:05:23 lr 0.000079 time 0.2496 (0.2510) loss 0.3880 (0.3922) grad_norm 526473.0625 (inf) mem 14543MB +[2023-10-10 06:48:00 simmim_pretrain](main_simmim.py 218): INFO Train: [7/200][6000/6787] eta 0:03:17 lr 0.000079 time 0.2510 (0.2510) loss 0.3696 (0.3920) grad_norm 810871.0000 (inf) mem 14543MB +[2023-10-10 06:50:05 
simmim_pretrain](main_simmim.py 218): INFO Train: [7/200][6500/6787] eta 0:01:12 lr 0.000080 time 0.2573 (0.2510) loss 0.3907 (0.3918) grad_norm 390052.0625 (inf) mem 14543MB +[2023-10-10 06:51:18 simmim_pretrain](main_simmim.py 228): INFO EPOCH 7 training takes 0:28:24 +[2023-10-10 06:51:19 simmim_pretrain](main_simmim.py 218): INFO Train: [8/200][0/6787] eta 2:29:18 lr 0.000081 time 1.3200 (1.3200) loss 0.4024 (0.4024) grad_norm 462567.0938 (462567.0938) mem 14543MB +[2023-10-10 06:53:24 simmim_pretrain](main_simmim.py 218): INFO Train: [8/200][500/6787] eta 0:26:28 lr 0.000081 time 0.2499 (0.2527) loss 0.3791 (0.3907) grad_norm 581210.1875 (inf) mem 14543MB +[2023-10-10 06:55:30 simmim_pretrain](main_simmim.py 218): INFO Train: [8/200][1000/6787] eta 0:24:17 lr 0.000082 time 0.2506 (0.2519) loss 0.3972 (0.3905) grad_norm 479175.9062 (inf) mem 14543MB +[2023-10-10 06:57:36 simmim_pretrain](main_simmim.py 218): INFO Train: [8/200][1500/6787] eta 0:22:10 lr 0.000083 time 0.2520 (0.2516) loss 0.3999 (0.3902) grad_norm 466565.5938 (inf) mem 14543MB +[2023-10-10 06:59:41 simmim_pretrain](main_simmim.py 218): INFO Train: [8/200][2000/6787] eta 0:20:04 lr 0.000084 time 0.2565 (0.2516) loss 0.3970 (0.3901) grad_norm 299479.5625 (inf) mem 14543MB +[2023-10-10 07:01:47 simmim_pretrain](main_simmim.py 218): INFO Train: [8/200][2500/6787] eta 0:17:58 lr 0.000084 time 0.2592 (0.2515) loss 0.3868 (0.3900) grad_norm 555698.1250 (inf) mem 14543MB +[2023-10-10 07:03:52 simmim_pretrain](main_simmim.py 218): INFO Train: [8/200][3000/6787] eta 0:15:52 lr 0.000085 time 0.2514 (0.2515) loss 0.4017 (0.3901) grad_norm 632740.1875 (inf) mem 14543MB +[2023-10-10 07:05:58 simmim_pretrain](main_simmim.py 218): INFO Train: [8/200][3500/6787] eta 0:13:46 lr 0.000086 time 0.2549 (0.2514) loss 0.3963 (0.3898) grad_norm 455604.8438 (inf) mem 14543MB +[2023-10-10 07:08:04 simmim_pretrain](main_simmim.py 218): INFO Train: [8/200][4000/6787] eta 0:11:40 lr 0.000086 time 0.2590 (0.2514) loss 0.3721 
(0.3896) grad_norm 485743.6875 (inf) mem 14543MB +[2023-10-10 07:10:09 simmim_pretrain](main_simmim.py 218): INFO Train: [8/200][4500/6787] eta 0:09:34 lr 0.000087 time 0.2587 (0.2514) loss 0.3873 (0.3893) grad_norm 326329.6250 (inf) mem 14543MB +[2023-10-10 07:12:15 simmim_pretrain](main_simmim.py 218): INFO Train: [8/200][5000/6787] eta 0:07:29 lr 0.000088 time 0.2525 (0.2513) loss 0.3877 (0.3890) grad_norm 401584.0625 (inf) mem 14543MB +[2023-10-10 07:14:20 simmim_pretrain](main_simmim.py 218): INFO Train: [8/200][5500/6787] eta 0:05:23 lr 0.000089 time 0.2459 (0.2512) loss 0.4064 (0.3892) grad_norm 189235.9375 (inf) mem 14543MB +[2023-10-10 07:16:25 simmim_pretrain](main_simmim.py 218): INFO Train: [8/200][6000/6787] eta 0:03:17 lr 0.000089 time 0.2467 (0.2512) loss 0.4035 (0.3892) grad_norm 362180.9062 (inf) mem 14543MB +[2023-10-10 07:18:31 simmim_pretrain](main_simmim.py 218): INFO Train: [8/200][6500/6787] eta 0:01:12 lr 0.000090 time 0.2507 (0.2512) loss 0.4013 (0.3892) grad_norm 273559.9375 (inf) mem 14543MB +[2023-10-10 07:19:43 simmim_pretrain](main_simmim.py 228): INFO EPOCH 8 training takes 0:28:25 +[2023-10-10 07:19:45 simmim_pretrain](main_simmim.py 218): INFO Train: [9/200][0/6787] eta 2:30:51 lr 0.000091 time 1.3336 (1.3336) loss 0.3687 (0.3687) grad_norm 269559.4375 (269559.4375) mem 14543MB +[2023-10-10 07:21:50 simmim_pretrain](main_simmim.py 218): INFO Train: [9/200][500/6787] eta 0:26:26 lr 0.000091 time 0.2483 (0.2524) loss 0.4103 (0.3880) grad_norm 234218.1094 (248650.3594) mem 14543MB +[2023-10-10 07:23:55 simmim_pretrain](main_simmim.py 218): INFO Train: [9/200][1000/6787] eta 0:24:15 lr 0.000092 time 0.2495 (0.2516) loss 0.3877 (0.3875) grad_norm 313479.8750 (302919.0625) mem 14543MB +[2023-10-10 07:26:00 simmim_pretrain](main_simmim.py 218): INFO Train: [9/200][1500/6787] eta 0:22:07 lr 0.000093 time 0.2496 (0.2510) loss 0.3852 (0.3871) grad_norm 438994.5938 (334929.5938) mem 14543MB +[2023-10-10 07:28:05 simmim_pretrain](main_simmim.py 
218): INFO Train: [9/200][2000/6787] eta 0:20:00 lr 0.000093 time 0.2517 (0.2509) loss 0.3997 (0.3871) grad_norm 341424.4688 (344583.5000) mem 14543MB +[2023-10-10 07:30:10 simmim_pretrain](main_simmim.py 218): INFO Train: [9/200][2500/6787] eta 0:17:55 lr 0.000094 time 0.2499 (0.2508) loss 0.3826 (0.3870) grad_norm 555100.7500 (inf) mem 14543MB +[2023-10-10 07:32:16 simmim_pretrain](main_simmim.py 218): INFO Train: [9/200][3000/6787] eta 0:15:49 lr 0.000095 time 0.2470 (0.2508) loss 0.3847 (0.3868) grad_norm 362904.6875 (inf) mem 14543MB +[2023-10-10 07:34:21 simmim_pretrain](main_simmim.py 218): INFO Train: [9/200][3500/6787] eta 0:13:44 lr 0.000096 time 0.2512 (0.2508) loss 0.4190 (0.3867) grad_norm 335807.7812 (inf) mem 14543MB +[2023-10-10 07:36:26 simmim_pretrain](main_simmim.py 218): INFO Train: [9/200][4000/6787] eta 0:11:38 lr 0.000096 time 0.2588 (0.2507) loss 0.3723 (0.3866) grad_norm 727835.9375 (inf) mem 14543MB +[2023-10-10 07:38:31 simmim_pretrain](main_simmim.py 218): INFO Train: [9/200][4500/6787] eta 0:09:33 lr 0.000097 time 0.2522 (0.2506) loss 0.3831 (0.3865) grad_norm 556983.2500 (inf) mem 14543MB +[2023-10-10 07:40:37 simmim_pretrain](main_simmim.py 218): INFO Train: [9/200][5000/6787] eta 0:07:27 lr 0.000098 time 0.2449 (0.2506) loss 0.3679 (0.3865) grad_norm 215332.0156 (inf) mem 14543MB +[2023-10-10 07:42:42 simmim_pretrain](main_simmim.py 218): INFO Train: [9/200][5500/6787] eta 0:05:22 lr 0.000099 time 0.2553 (0.2507) loss 0.3855 (0.3864) grad_norm 302305.9375 (inf) mem 14543MB +[2023-10-10 07:44:48 simmim_pretrain](main_simmim.py 218): INFO Train: [9/200][6000/6787] eta 0:03:17 lr 0.000099 time 0.2467 (0.2507) loss 0.3639 (0.3863) grad_norm 642916.5625 (inf) mem 14543MB +[2023-10-10 07:46:53 simmim_pretrain](main_simmim.py 218): INFO Train: [9/200][6500/6787] eta 0:01:11 lr 0.000100 time 0.2517 (0.2507) loss 0.3862 (0.3861) grad_norm 245829.7031 (inf) mem 14543MB +[2023-10-10 07:48:05 simmim_pretrain](main_simmim.py 228): INFO EPOCH 9 
training takes 0:28:22 +[2023-10-10 07:48:07 simmim_pretrain](main_simmim.py 218): INFO Train: [10/200][0/6787] eta 2:39:31 lr 0.000101 time 1.4102 (1.4102) loss 0.3614 (0.3614) grad_norm 206831.5000 (206831.5000) mem 14543MB +[2023-10-10 07:50:12 simmim_pretrain](main_simmim.py 218): INFO Train: [10/200][500/6787] eta 0:26:29 lr 0.000101 time 0.2537 (0.2528) loss 0.3976 (0.3877) grad_norm 250481.3281 (277964.6875) mem 14543MB +[2023-10-10 07:52:18 simmim_pretrain](main_simmim.py 218): INFO Train: [10/200][1000/6787] eta 0:24:19 lr 0.000102 time 0.2493 (0.2522) loss 0.3950 (0.3866) grad_norm 289378.8438 (261880.9688) mem 14543MB +[2023-10-10 07:54:24 simmim_pretrain](main_simmim.py 218): INFO Train: [10/200][1500/6787] eta 0:22:12 lr 0.000103 time 0.2479 (0.2520) loss 0.3967 (0.3860) grad_norm 238961.7344 (258310.2500) mem 14543MB +[2023-10-10 07:56:29 simmim_pretrain](main_simmim.py 218): INFO Train: [10/200][2000/6787] eta 0:20:05 lr 0.000103 time 0.2485 (0.2519) loss 0.4014 (0.3857) grad_norm 327502.4688 (254304.8438) mem 14543MB +[2023-10-10 07:58:35 simmim_pretrain](main_simmim.py 218): INFO Train: [10/200][2500/6787] eta 0:17:58 lr 0.000104 time 0.2483 (0.2517) loss 0.3639 (0.3853) grad_norm 400637.7812 (281078.4688) mem 14543MB +[2023-10-10 08:00:40 simmim_pretrain](main_simmim.py 218): INFO Train: [10/200][3000/6787] eta 0:15:52 lr 0.000105 time 0.2491 (0.2515) loss 0.3961 (0.3848) grad_norm 352890.6250 (296152.2812) mem 14543MB +[2023-10-10 08:02:46 simmim_pretrain](main_simmim.py 218): INFO Train: [10/200][3500/6787] eta 0:13:46 lr 0.000106 time 0.2568 (0.2514) loss 0.3667 (0.3847) grad_norm 162273.3750 (310088.4688) mem 14543MB +[2023-10-10 08:04:51 simmim_pretrain](main_simmim.py 218): INFO Train: [10/200][4000/6787] eta 0:11:40 lr 0.000106 time 0.2547 (0.2514) loss 0.3828 (0.3848) grad_norm 212811.9688 (inf) mem 14543MB +[2023-10-10 08:06:57 simmim_pretrain](main_simmim.py 218): INFO Train: [10/200][4500/6787] eta 0:09:34 lr 0.000107 time 0.2542 
(0.2514) loss 0.3775 (0.3848) grad_norm 204729.8438 (inf) mem 14543MB +[2023-10-10 08:09:02 simmim_pretrain](main_simmim.py 218): INFO Train: [10/200][5000/6787] eta 0:07:29 lr 0.000108 time 0.2515 (0.2513) loss 0.3938 (0.3846) grad_norm 193556.4219 (inf) mem 14543MB +[2023-10-10 08:11:07 simmim_pretrain](main_simmim.py 218): INFO Train: [10/200][5500/6787] eta 0:05:23 lr 0.000109 time 0.2584 (0.2512) loss 0.3777 (0.3845) grad_norm 337832.3438 (inf) mem 14543MB +[2023-10-10 08:13:13 simmim_pretrain](main_simmim.py 218): INFO Train: [10/200][6000/6787] eta 0:03:17 lr 0.000109 time 0.2485 (0.2512) loss 0.3934 (0.3845) grad_norm 202112.8438 (inf) mem 14543MB +[2023-10-10 08:15:18 simmim_pretrain](main_simmim.py 218): INFO Train: [10/200][6500/6787] eta 0:01:12 lr 0.000110 time 0.2473 (0.2511) loss 0.3743 (0.3844) grad_norm 253926.7344 (inf) mem 14543MB +[2023-10-10 08:16:31 simmim_pretrain](main_simmim.py 228): INFO EPOCH 10 training takes 0:28:25 +[2023-10-10 08:16:32 simmim_pretrain](main_simmim.py 218): INFO Train: [11/200][0/6787] eta 2:35:59 lr 0.000110 time 1.3791 (1.3791) loss 0.3652 (0.3652) grad_norm 205525.9062 (205525.9062) mem 14543MB +[2023-10-10 08:18:37 simmim_pretrain](main_simmim.py 218): INFO Train: [11/200][500/6787] eta 0:26:27 lr 0.000111 time 0.2514 (0.2524) loss 0.3697 (0.3828) grad_norm 146882.2188 (211697.1562) mem 14543MB +[2023-10-10 08:20:42 simmim_pretrain](main_simmim.py 218): INFO Train: [11/200][1000/6787] eta 0:24:14 lr 0.000112 time 0.2513 (0.2514) loss 0.3967 (0.3824) grad_norm 592611.6875 (235206.2344) mem 14543MB +[2023-10-10 08:22:47 simmim_pretrain](main_simmim.py 218): INFO Train: [11/200][1500/6787] eta 0:22:06 lr 0.000113 time 0.2457 (0.2510) loss 0.3838 (0.3819) grad_norm 375908.1250 (275599.5938) mem 14543MB +[2023-10-10 08:24:53 simmim_pretrain](main_simmim.py 218): INFO Train: [11/200][2000/6787] eta 0:20:00 lr 0.000113 time 0.2545 (0.2509) loss 0.3628 (0.3817) grad_norm 183156.7656 (281917.5625) mem 14543MB +[2023-10-10 
08:26:58 simmim_pretrain](main_simmim.py 218): INFO Train: [11/200][2500/6787] eta 0:17:55 lr 0.000114 time 0.2504 (0.2508) loss 0.3877 (0.3815) grad_norm 225173.9062 (291856.5938) mem 14543MB +[2023-10-10 08:29:03 simmim_pretrain](main_simmim.py 218): INFO Train: [11/200][3000/6787] eta 0:15:49 lr 0.000115 time 0.2463 (0.2508) loss 0.3689 (0.3813) grad_norm 298418.0625 (315759.6250) mem 14543MB +[2023-10-10 08:31:09 simmim_pretrain](main_simmim.py 218): INFO Train: [11/200][3500/6787] eta 0:13:44 lr 0.000116 time 0.2540 (0.2508) loss 0.3907 (0.3813) grad_norm 367320.2188 (inf) mem 14543MB +[2023-10-10 08:33:14 simmim_pretrain](main_simmim.py 218): INFO Train: [11/200][4000/6787] eta 0:11:38 lr 0.000116 time 0.2466 (0.2508) loss 0.3818 (0.3813) grad_norm 263023.5312 (inf) mem 14543MB +[2023-10-10 08:35:19 simmim_pretrain](main_simmim.py 218): INFO Train: [11/200][4500/6787] eta 0:09:33 lr 0.000117 time 0.2502 (0.2508) loss 0.3637 (0.3813) grad_norm 244497.2344 (inf) mem 14543MB +[2023-10-10 08:37:25 simmim_pretrain](main_simmim.py 218): INFO Train: [11/200][5000/6787] eta 0:07:28 lr 0.000118 time 0.2505 (0.2508) loss 0.3614 (0.3814) grad_norm 191996.0156 (inf) mem 14543MB +[2023-10-10 08:39:30 simmim_pretrain](main_simmim.py 218): INFO Train: [11/200][5500/6787] eta 0:05:22 lr 0.000119 time 0.2514 (0.2508) loss 0.3688 (0.3812) grad_norm 299578.2500 (inf) mem 14543MB +[2023-10-10 08:41:36 simmim_pretrain](main_simmim.py 218): INFO Train: [11/200][6000/6787] eta 0:03:17 lr 0.000119 time 0.2511 (0.2508) loss 0.3755 (0.3811) grad_norm 296311.7188 (inf) mem 14543MB +[2023-10-10 08:43:41 simmim_pretrain](main_simmim.py 218): INFO Train: [11/200][6500/6787] eta 0:01:11 lr 0.000120 time 0.2488 (0.2509) loss 0.3962 (0.3810) grad_norm 422952.8750 (inf) mem 14543MB +[2023-10-10 08:44:54 simmim_pretrain](main_simmim.py 228): INFO EPOCH 11 training takes 0:28:23 +[2023-10-10 08:44:55 simmim_pretrain](main_simmim.py 218): INFO Train: [12/200][0/6787] eta 2:39:31 lr 0.000120 time 
1.4102 (1.4102) loss 0.3745 (0.3745) grad_norm 484865.1562 (484865.1562) mem 14543MB +[2023-10-10 08:47:01 simmim_pretrain](main_simmim.py 218): INFO Train: [12/200][500/6787] eta 0:26:31 lr 0.000121 time 0.2541 (0.2532) loss 0.3719 (0.3793) grad_norm 470115.7188 (inf) mem 14543MB +[2023-10-10 08:49:06 simmim_pretrain](main_simmim.py 218): INFO Train: [12/200][1000/6787] eta 0:24:19 lr 0.000122 time 0.2529 (0.2522) loss 0.3842 (0.3792) grad_norm 764396.7500 (inf) mem 14543MB +[2023-10-10 08:51:12 simmim_pretrain](main_simmim.py 218): INFO Train: [12/200][1500/6787] eta 0:22:12 lr 0.000123 time 0.2485 (0.2520) loss 0.3747 (0.3795) grad_norm 328970.7188 (inf) mem 14543MB +[2023-10-10 08:53:18 simmim_pretrain](main_simmim.py 218): INFO Train: [12/200][2000/6787] eta 0:20:05 lr 0.000123 time 0.2571 (0.2518) loss 0.3731 (0.3796) grad_norm 670594.7500 (inf) mem 14543MB +[2023-10-10 08:55:23 simmim_pretrain](main_simmim.py 218): INFO Train: [12/200][2500/6787] eta 0:17:58 lr 0.000124 time 0.2509 (0.2516) loss 0.3944 (0.3794) grad_norm 296960.9688 (inf) mem 14543MB +[2023-10-10 08:57:29 simmim_pretrain](main_simmim.py 218): INFO Train: [12/200][3000/6787] eta 0:15:52 lr 0.000125 time 0.2590 (0.2514) loss 0.3735 (0.3791) grad_norm 424444.0000 (inf) mem 14543MB +[2023-10-10 08:59:34 simmim_pretrain](main_simmim.py 218): INFO Train: [12/200][3500/6787] eta 0:13:46 lr 0.000126 time 0.2523 (0.2514) loss 0.3920 (0.3791) grad_norm 327560.0312 (inf) mem 14543MB +[2023-10-10 09:01:40 simmim_pretrain](main_simmim.py 218): INFO Train: [12/200][4000/6787] eta 0:11:40 lr 0.000126 time 0.2525 (0.2514) loss 0.3766 (0.3790) grad_norm 415809.1875 (inf) mem 14543MB +[2023-10-10 09:03:45 simmim_pretrain](main_simmim.py 218): INFO Train: [12/200][4500/6787] eta 0:09:34 lr 0.000127 time 0.2504 (0.2514) loss 0.3872 (0.3789) grad_norm 255386.7344 (inf) mem 14543MB +[2023-10-10 09:05:51 simmim_pretrain](main_simmim.py 218): INFO Train: [12/200][5000/6787] eta 0:07:29 lr 0.000128 time 0.2589 
(0.2513) loss 0.3900 (0.3788) grad_norm 466542.5000 (inf) mem 14543MB +[2023-10-10 09:07:56 simmim_pretrain](main_simmim.py 218): INFO Train: [12/200][5500/6787] eta 0:05:23 lr 0.000128 time 0.2495 (0.2513) loss 0.3769 (0.3787) grad_norm 407039.7188 (inf) mem 14543MB +[2023-10-10 09:10:02 simmim_pretrain](main_simmim.py 218): INFO Train: [12/200][6000/6787] eta 0:03:17 lr 0.000129 time 0.2507 (0.2513) loss 0.3963 (0.3787) grad_norm 707011.6250 (inf) mem 14543MB +[2023-10-10 09:12:07 simmim_pretrain](main_simmim.py 218): INFO Train: [12/200][6500/6787] eta 0:01:12 lr 0.000130 time 0.2570 (0.2513) loss 0.3745 (0.3787) grad_norm 349811.6562 (inf) mem 14543MB +[2023-10-10 09:13:20 simmim_pretrain](main_simmim.py 228): INFO EPOCH 12 training takes 0:28:25 +[2023-10-10 09:13:21 simmim_pretrain](main_simmim.py 218): INFO Train: [13/200][0/6787] eta 2:45:14 lr 0.000130 time 1.4608 (1.4608) loss 0.4052 (0.4052) grad_norm 206445.3281 (206445.3281) mem 14543MB +[2023-10-10 09:15:27 simmim_pretrain](main_simmim.py 218): INFO Train: [13/200][500/6787] eta 0:26:32 lr 0.000131 time 0.2471 (0.2533) loss 0.3712 (0.3775) grad_norm 356742.4375 (400614.3125) mem 14543MB +[2023-10-10 09:17:32 simmim_pretrain](main_simmim.py 218): INFO Train: [13/200][1000/6787] eta 0:24:18 lr 0.000132 time 0.2492 (0.2521) loss 0.3737 (0.3769) grad_norm 364778.0938 (404376.3125) mem 14543MB +[2023-10-10 09:19:38 simmim_pretrain](main_simmim.py 218): INFO Train: [13/200][1500/6787] eta 0:22:10 lr 0.000133 time 0.2487 (0.2516) loss 0.4008 (0.3768) grad_norm 237716.0156 (inf) mem 14543MB +[2023-10-10 09:21:43 simmim_pretrain](main_simmim.py 218): INFO Train: [13/200][2000/6787] eta 0:20:03 lr 0.000133 time 0.2489 (0.2514) loss 0.3800 (0.3775) grad_norm 351537.3125 (inf) mem 14543MB +[2023-10-10 09:23:49 simmim_pretrain](main_simmim.py 218): INFO Train: [13/200][2500/6787] eta 0:17:57 lr 0.000134 time 0.2520 (0.2514) loss 0.3871 (0.3779) grad_norm 112729.3516 (inf) mem 14543MB +[2023-10-10 09:25:54 
simmim_pretrain](main_simmim.py 218): INFO Train: [13/200][3000/6787] eta 0:15:51 lr 0.000135 time 0.2508 (0.2513) loss 0.4020 (0.3781) grad_norm 184437.5469 (inf) mem 14543MB +[2023-10-10 09:27:59 simmim_pretrain](main_simmim.py 218): INFO Train: [13/200][3500/6787] eta 0:13:45 lr 0.000135 time 0.2513 (0.2512) loss 0.3640 (0.3782) grad_norm 221889.7969 (inf) mem 14543MB +[2023-10-10 09:30:05 simmim_pretrain](main_simmim.py 218): INFO Train: [13/200][4000/6787] eta 0:11:39 lr 0.000136 time 0.2492 (0.2511) loss 0.3579 (0.3780) grad_norm 317096.5000 (inf) mem 14543MB +[2023-10-10 09:32:10 simmim_pretrain](main_simmim.py 218): INFO Train: [13/200][4500/6787] eta 0:09:34 lr 0.000137 time 0.2519 (0.2510) loss 0.3949 (0.3779) grad_norm 290177.7188 (inf) mem 14543MB +[2023-10-10 09:34:15 simmim_pretrain](main_simmim.py 218): INFO Train: [13/200][5000/6787] eta 0:07:28 lr 0.000138 time 0.2482 (0.2510) loss 0.3680 (0.3778) grad_norm 273392.9375 (inf) mem 14543MB +[2023-10-10 09:36:21 simmim_pretrain](main_simmim.py 218): INFO Train: [13/200][5500/6787] eta 0:05:23 lr 0.000138 time 0.2463 (0.2510) loss 0.3673 (0.3777) grad_norm 167449.9688 (inf) mem 14543MB +[2023-10-10 09:38:26 simmim_pretrain](main_simmim.py 218): INFO Train: [13/200][6000/6787] eta 0:03:17 lr 0.000139 time 0.2525 (0.2510) loss 0.3810 (0.3775) grad_norm 343543.6875 (inf) mem 14543MB +[2023-10-10 09:40:32 simmim_pretrain](main_simmim.py 218): INFO Train: [13/200][6500/6787] eta 0:01:12 lr 0.000140 time 0.2468 (0.2510) loss 0.3784 (0.3774) grad_norm 397498.8125 (inf) mem 14543MB +[2023-10-10 09:41:44 simmim_pretrain](main_simmim.py 228): INFO EPOCH 13 training takes 0:28:24 +[2023-10-10 09:41:45 simmim_pretrain](main_simmim.py 218): INFO Train: [14/200][0/6787] eta 2:32:41 lr 0.000140 time 1.3499 (1.3499) loss 0.3790 (0.3790) grad_norm 157906.1406 (157906.1406) mem 14543MB +[2023-10-10 09:43:51 simmim_pretrain](main_simmim.py 218): INFO Train: [14/200][500/6787] eta 0:26:27 lr 0.000141 time 0.2483 (0.2526) 
loss 0.3835 (0.3764) grad_norm 265614.0000 (348892.8750) mem 14543MB +[2023-10-10 09:45:56 simmim_pretrain](main_simmim.py 218): INFO Train: [14/200][1000/6787] eta 0:24:16 lr 0.000142 time 0.2471 (0.2517) loss 0.3623 (0.3762) grad_norm 458638.5312 (376636.5938) mem 14543MB +[2023-10-10 09:48:01 simmim_pretrain](main_simmim.py 218): INFO Train: [14/200][1500/6787] eta 0:22:09 lr 0.000142 time 0.2499 (0.2515) loss 0.3736 (0.3758) grad_norm 315340.2812 (376060.9062) mem 14543MB +[2023-10-10 09:50:07 simmim_pretrain](main_simmim.py 218): INFO Train: [14/200][2000/6787] eta 0:20:03 lr 0.000143 time 0.2490 (0.2514) loss 0.3591 (0.3756) grad_norm 250537.8125 (inf) mem 14543MB +[2023-10-10 09:52:12 simmim_pretrain](main_simmim.py 218): INFO Train: [14/200][2500/6787] eta 0:17:57 lr 0.000144 time 0.2508 (0.2513) loss 0.3965 (0.3757) grad_norm 534742.6875 (inf) mem 14543MB +[2023-10-10 09:54:18 simmim_pretrain](main_simmim.py 218): INFO Train: [14/200][3000/6787] eta 0:15:51 lr 0.000145 time 0.2540 (0.2513) loss 0.3713 (0.3757) grad_norm 177548.0938 (inf) mem 14543MB +[2023-10-10 09:56:24 simmim_pretrain](main_simmim.py 218): INFO Train: [14/200][3500/6787] eta 0:13:45 lr 0.000145 time 0.2512 (0.2513) loss 0.3679 (0.3759) grad_norm 434317.5312 (inf) mem 14543MB +[2023-10-10 09:58:29 simmim_pretrain](main_simmim.py 218): INFO Train: [14/200][4000/6787] eta 0:11:40 lr 0.000146 time 0.2523 (0.2513) loss 0.3780 (0.3761) grad_norm 392255.7500 (inf) mem 14543MB +[2023-10-10 10:00:35 simmim_pretrain](main_simmim.py 218): INFO Train: [14/200][4500/6787] eta 0:09:34 lr 0.000147 time 0.2472 (0.2513) loss 0.3873 (0.3763) grad_norm 267664.6250 (inf) mem 14543MB +[2023-10-10 10:02:40 simmim_pretrain](main_simmim.py 218): INFO Train: [14/200][5000/6787] eta 0:07:28 lr 0.000148 time 0.2499 (0.2512) loss 0.3720 (0.3765) grad_norm 205618.3750 (inf) mem 14543MB +[2023-10-10 10:04:45 simmim_pretrain](main_simmim.py 218): INFO Train: [14/200][5500/6787] eta 0:05:23 lr 0.000148 time 0.2500 
(0.2511) loss 0.3801 (0.3765) grad_norm 323696.4062 (inf) mem 14543MB +[2023-10-10 10:06:51 simmim_pretrain](main_simmim.py 218): INFO Train: [14/200][6000/6787] eta 0:03:17 lr 0.000149 time 0.2513 (0.2511) loss 0.3932 (0.3765) grad_norm 334242.9375 (inf) mem 14543MB +[2023-10-10 10:08:56 simmim_pretrain](main_simmim.py 218): INFO Train: [14/200][6500/6787] eta 0:01:12 lr 0.000150 time 0.2473 (0.2511) loss 0.3664 (0.3765) grad_norm 168673.9531 (inf) mem 14543MB +[2023-10-10 10:10:09 simmim_pretrain](main_simmim.py 228): INFO EPOCH 14 training takes 0:28:24 +[2023-10-10 10:10:10 simmim_pretrain](main_simmim.py 218): INFO Train: [15/200][0/6787] eta 2:28:00 lr 0.000150 time 1.3084 (1.3084) loss 0.3545 (0.3545) grad_norm 348921.0938 (348921.0938) mem 14543MB +[2023-10-10 10:12:15 simmim_pretrain](main_simmim.py 218): INFO Train: [15/200][500/6787] eta 0:26:29 lr 0.000151 time 0.2513 (0.2528) loss 0.3871 (0.3747) grad_norm 249418.1562 (388708.0938) mem 14543MB +[2023-10-10 10:14:21 simmim_pretrain](main_simmim.py 218): INFO Train: [15/200][1000/6787] eta 0:24:15 lr 0.000152 time 0.2491 (0.2516) loss 0.3828 (0.3745) grad_norm 672871.3750 (378720.8125) mem 14543MB +[2023-10-10 10:16:25 simmim_pretrain](main_simmim.py 218): INFO Train: [15/200][1500/6787] eta 0:22:07 lr 0.000152 time 0.2469 (0.2510) loss 0.3955 (0.3745) grad_norm 569593.0000 (408535.6250) mem 14543MB +[2023-10-10 10:18:31 simmim_pretrain](main_simmim.py 218): INFO Train: [15/200][2000/6787] eta 0:20:01 lr 0.000153 time 0.2551 (0.2509) loss 0.3684 (0.3744) grad_norm 611606.6250 (433148.2500) mem 14543MB +[2023-10-10 10:20:36 simmim_pretrain](main_simmim.py 218): INFO Train: [15/200][2500/6787] eta 0:17:55 lr 0.000154 time 0.2512 (0.2509) loss 0.3799 (0.3746) grad_norm 244301.2969 (inf) mem 14543MB +[2023-10-10 10:22:42 simmim_pretrain](main_simmim.py 218): INFO Train: [15/200][3000/6787] eta 0:15:50 lr 0.000155 time 0.2497 (0.2509) loss 0.3637 (0.3751) grad_norm 168420.6406 (inf) mem 14543MB +[2023-10-10 
10:24:47 simmim_pretrain](main_simmim.py 218): INFO Train: [15/200][3500/6787] eta 0:13:44 lr 0.000155 time 0.2466 (0.2509) loss 0.4175 (0.3752) grad_norm 155449.8438 (inf) mem 14543MB +[2023-10-10 10:26:52 simmim_pretrain](main_simmim.py 218): INFO Train: [15/200][4000/6787] eta 0:11:38 lr 0.000156 time 0.2534 (0.2507) loss 0.3790 (0.3754) grad_norm 273717.5938 (inf) mem 14543MB +[2023-10-10 10:28:57 simmim_pretrain](main_simmim.py 218): INFO Train: [15/200][4500/6787] eta 0:09:33 lr 0.000157 time 0.2525 (0.2507) loss 0.3594 (0.3755) grad_norm 463704.3125 (inf) mem 14543MB +[2023-10-10 10:31:03 simmim_pretrain](main_simmim.py 218): INFO Train: [15/200][5000/6787] eta 0:07:28 lr 0.000158 time 0.2519 (0.2507) loss 0.3433 (0.3754) grad_norm 291185.7812 (inf) mem 14543MB +[2023-10-10 10:33:08 simmim_pretrain](main_simmim.py 218): INFO Train: [15/200][5500/6787] eta 0:05:22 lr 0.000158 time 0.2596 (0.2508) loss 0.3643 (0.3753) grad_norm 334836.7500 (inf) mem 14543MB +[2023-10-10 10:35:14 simmim_pretrain](main_simmim.py 218): INFO Train: [15/200][6000/6787] eta 0:03:17 lr 0.000159 time 0.2543 (0.2508) loss 0.3888 (0.3753) grad_norm 485714.3750 (inf) mem 14543MB +[2023-10-10 10:37:19 simmim_pretrain](main_simmim.py 218): INFO Train: [15/200][6500/6787] eta 0:01:11 lr 0.000160 time 0.2473 (0.2508) loss 0.3791 (0.3752) grad_norm 241484.3750 (inf) mem 14543MB +[2023-10-10 10:38:31 simmim_pretrain](main_simmim.py 228): INFO EPOCH 15 training takes 0:28:22 +[2023-10-10 10:38:33 simmim_pretrain](main_simmim.py 218): INFO Train: [16/200][0/6787] eta 2:42:39 lr 0.000160 time 1.4380 (1.4380) loss 0.3894 (0.3894) grad_norm 362608.6875 (362608.6875) mem 14543MB +[2023-10-10 10:40:38 simmim_pretrain](main_simmim.py 218): INFO Train: [16/200][500/6787] eta 0:26:30 lr 0.000161 time 0.2467 (0.2531) loss 0.3715 (0.3739) grad_norm 204073.4062 (389797.8750) mem 14543MB +[2023-10-10 10:42:44 simmim_pretrain](main_simmim.py 218): INFO Train: [16/200][1000/6787] eta 0:24:19 lr 0.000162 time 
0.2531 (0.2522) loss 0.4047 (0.3739) grad_norm 298060.7812 (390827.1250) mem 14543MB +[2023-10-10 10:44:50 simmim_pretrain](main_simmim.py 218): INFO Train: [16/200][1500/6787] eta 0:22:11 lr 0.000162 time 0.2463 (0.2519) loss 0.3602 (0.3742) grad_norm 406142.9375 (413585.6250) mem 14543MB +[2023-10-10 10:46:55 simmim_pretrain](main_simmim.py 218): INFO Train: [16/200][2000/6787] eta 0:20:05 lr 0.000163 time 0.2520 (0.2518) loss 0.3567 (0.3739) grad_norm 382483.0938 (420028.4688) mem 14543MB +[2023-10-10 10:49:01 simmim_pretrain](main_simmim.py 218): INFO Train: [16/200][2500/6787] eta 0:17:58 lr 0.000164 time 0.2527 (0.2516) loss 0.3507 (0.3739) grad_norm 302461.8438 (inf) mem 14543MB +[2023-10-10 10:51:06 simmim_pretrain](main_simmim.py 218): INFO Train: [16/200][3000/6787] eta 0:15:52 lr 0.000165 time 0.2517 (0.2515) loss 0.3827 (0.3738) grad_norm 278482.3750 (inf) mem 14543MB +[2023-10-10 10:53:12 simmim_pretrain](main_simmim.py 218): INFO Train: [16/200][3500/6787] eta 0:13:46 lr 0.000165 time 0.2530 (0.2515) loss 0.3645 (0.3738) grad_norm 391500.1875 (inf) mem 14543MB +[2023-10-10 10:55:18 simmim_pretrain](main_simmim.py 218): INFO Train: [16/200][4000/6787] eta 0:11:40 lr 0.000166 time 0.2518 (0.2515) loss 0.3427 (0.3739) grad_norm 327112.0312 (inf) mem 14543MB +[2023-10-10 10:57:23 simmim_pretrain](main_simmim.py 218): INFO Train: [16/200][4500/6787] eta 0:09:35 lr 0.000167 time 0.2502 (0.2514) loss 0.3659 (0.3740) grad_norm 544566.3750 (inf) mem 14543MB +[2023-10-10 10:59:28 simmim_pretrain](main_simmim.py 218): INFO Train: [16/200][5000/6787] eta 0:07:29 lr 0.000168 time 0.2477 (0.2514) loss 0.3801 (0.3740) grad_norm 525298.6875 (inf) mem 14543MB +[2023-10-10 11:01:34 simmim_pretrain](main_simmim.py 218): INFO Train: [16/200][5500/6787] eta 0:05:23 lr 0.000168 time 0.2459 (0.2513) loss 0.3583 (0.3740) grad_norm 315833.1562 (inf) mem 14543MB +[2023-10-10 11:03:39 simmim_pretrain](main_simmim.py 218): INFO Train: [16/200][6000/6787] eta 0:03:17 lr 0.000169 
time 0.2518 (0.2512) loss 0.3775 (0.3740) grad_norm 313538.5000 (inf) mem 14543MB +[2023-10-10 11:05:45 simmim_pretrain](main_simmim.py 218): INFO Train: [16/200][6500/6787] eta 0:01:12 lr 0.000170 time 0.2450 (0.2512) loss 0.3825 (0.3740) grad_norm 463360.4062 (inf) mem 14543MB +[2023-10-10 11:06:57 simmim_pretrain](main_simmim.py 228): INFO EPOCH 16 training takes 0:28:25 +[2023-10-10 11:06:58 simmim_pretrain](main_simmim.py 218): INFO Train: [17/200][0/6787] eta 2:32:04 lr 0.000170 time 1.3444 (1.3444) loss 0.3684 (0.3684) grad_norm 429457.3125 (429457.3125) mem 14543MB +[2023-10-10 11:09:04 simmim_pretrain](main_simmim.py 218): INFO Train: [17/200][500/6787] eta 0:26:28 lr 0.000171 time 0.2518 (0.2527) loss 0.3724 (0.3734) grad_norm 277655.5625 (435504.7188) mem 14543MB +[2023-10-10 11:11:09 simmim_pretrain](main_simmim.py 218): INFO Train: [17/200][1000/6787] eta 0:24:16 lr 0.000172 time 0.2500 (0.2516) loss 0.3500 (0.3734) grad_norm 880038.3750 (430012.0625) mem 14543MB +[2023-10-10 11:13:14 simmim_pretrain](main_simmim.py 218): INFO Train: [17/200][1500/6787] eta 0:22:08 lr 0.000172 time 0.2460 (0.2512) loss 0.3756 (0.3735) grad_norm 368309.5938 (427902.3750) mem 14543MB +[2023-10-10 11:15:20 simmim_pretrain](main_simmim.py 218): INFO Train: [17/200][2000/6787] eta 0:20:01 lr 0.000173 time 0.2537 (0.2511) loss 0.3495 (0.3735) grad_norm 314261.6875 (inf) mem 14543MB +[2023-10-10 11:17:25 simmim_pretrain](main_simmim.py 218): INFO Train: [17/200][2500/6787] eta 0:17:56 lr 0.000174 time 0.2474 (0.2511) loss 0.3671 (0.3736) grad_norm 403794.0625 (inf) mem 14543MB +[2023-10-10 11:19:31 simmim_pretrain](main_simmim.py 218): INFO Train: [17/200][3000/6787] eta 0:15:50 lr 0.000175 time 0.2529 (0.2511) loss 0.3741 (0.3736) grad_norm 328522.8438 (inf) mem 14543MB +[2023-10-10 11:21:36 simmim_pretrain](main_simmim.py 218): INFO Train: [17/200][3500/6787] eta 0:13:45 lr 0.000175 time 0.2515 (0.2511) loss 0.3539 (0.3736) grad_norm 207432.8125 (inf) mem 14543MB 
+[2023-10-10 11:23:41 simmim_pretrain](main_simmim.py 218): INFO Train: [17/200][4000/6787] eta 0:11:39 lr 0.000176 time 0.2533 (0.2510) loss 0.3923 (0.3739) grad_norm 253584.5469 (inf) mem 14543MB +[2023-10-10 11:25:47 simmim_pretrain](main_simmim.py 218): INFO Train: [17/200][4500/6787] eta 0:09:34 lr 0.000177 time 0.2494 (0.2510) loss 0.3326 (0.3741) grad_norm 203535.0469 (inf) mem 14543MB +[2023-10-10 11:27:53 simmim_pretrain](main_simmim.py 218): INFO Train: [17/200][5000/6787] eta 0:07:28 lr 0.000177 time 0.2547 (0.2511) loss 0.3751 (0.3742) grad_norm 213703.9531 (inf) mem 14543MB +[2023-10-10 11:29:59 simmim_pretrain](main_simmim.py 218): INFO Train: [17/200][5500/6787] eta 0:05:23 lr 0.000178 time 0.2507 (0.2511) loss 0.3826 (0.3743) grad_norm 269971.9375 (inf) mem 14543MB +[2023-10-10 11:32:05 simmim_pretrain](main_simmim.py 218): INFO Train: [17/200][6000/6787] eta 0:03:17 lr 0.000179 time 0.2462 (0.2512) loss 0.3550 (0.3742) grad_norm 183230.6094 (inf) mem 14543MB +[2023-10-10 11:34:10 simmim_pretrain](main_simmim.py 218): INFO Train: [17/200][6500/6787] eta 0:01:12 lr 0.000180 time 0.2534 (0.2512) loss 0.3435 (0.3741) grad_norm 274625.6250 (inf) mem 14543MB +[2023-10-10 11:35:23 simmim_pretrain](main_simmim.py 228): INFO EPOCH 17 training takes 0:28:25 +[2023-10-10 11:35:24 simmim_pretrain](main_simmim.py 218): INFO Train: [18/200][0/6787] eta 2:32:31 lr 0.000180 time 1.3484 (1.3484) loss 0.3527 (0.3527) grad_norm 467103.2188 (467103.2188) mem 14543MB +[2023-10-10 11:37:30 simmim_pretrain](main_simmim.py 218): INFO Train: [18/200][500/6787] eta 0:26:32 lr 0.000181 time 0.2516 (0.2534) loss 0.3815 (0.3724) grad_norm 338192.0312 (373958.0938) mem 14543MB +[2023-10-10 11:39:36 simmim_pretrain](main_simmim.py 218): INFO Train: [18/200][1000/6787] eta 0:24:22 lr 0.000182 time 0.2586 (0.2527) loss 0.3758 (0.3723) grad_norm 247274.8750 (inf) mem 14543MB +[2023-10-10 11:41:42 simmim_pretrain](main_simmim.py 218): INFO Train: [18/200][1500/6787] eta 0:22:14 lr 
0.000182 time 0.2521 (0.2524) loss 0.3602 (0.3725) grad_norm 319412.8750 (inf) mem 14543MB +[2023-10-10 11:43:48 simmim_pretrain](main_simmim.py 218): INFO Train: [18/200][2000/6787] eta 0:20:07 lr 0.000183 time 0.2493 (0.2523) loss 0.3636 (0.3723) grad_norm 288363.0625 (inf) mem 14543MB +[2023-10-10 11:45:53 simmim_pretrain](main_simmim.py 218): INFO Train: [18/200][2500/6787] eta 0:18:00 lr 0.000184 time 0.2502 (0.2520) loss 0.3819 (0.3723) grad_norm 243989.2812 (inf) mem 14543MB +[2023-10-10 11:47:59 simmim_pretrain](main_simmim.py 218): INFO Train: [18/200][3000/6787] eta 0:15:54 lr 0.000184 time 0.2545 (0.2519) loss 0.3540 (0.3725) grad_norm 432844.7188 (inf) mem 14543MB +[2023-10-10 11:50:05 simmim_pretrain](main_simmim.py 218): INFO Train: [18/200][3500/6787] eta 0:13:48 lr 0.000185 time 0.2486 (0.2519) loss 0.3596 (0.3725) grad_norm 425532.1562 (inf) mem 14543MB +[2023-10-10 11:52:12 simmim_pretrain](main_simmim.py 218): INFO Train: [18/200][4000/6787] eta 0:11:42 lr 0.000186 time 0.2540 (0.2521) loss 0.3539 (0.3723) grad_norm 317772.3438 (inf) mem 14543MB +[2023-10-10 11:54:19 simmim_pretrain](main_simmim.py 218): INFO Train: [18/200][4500/6787] eta 0:09:37 lr 0.000187 time 0.2525 (0.2524) loss 0.3680 (0.3724) grad_norm 671943.8750 (inf) mem 14543MB +[2023-10-10 11:56:26 simmim_pretrain](main_simmim.py 218): INFO Train: [18/200][5000/6787] eta 0:07:31 lr 0.000187 time 0.2496 (0.2526) loss 0.3616 (0.3725) grad_norm 406137.6250 (inf) mem 14543MB +[2023-10-10 11:58:33 simmim_pretrain](main_simmim.py 218): INFO Train: [18/200][5500/6787] eta 0:05:25 lr 0.000188 time 0.2542 (0.2527) loss 0.3831 (0.3725) grad_norm 449747.0938 (inf) mem 14543MB +[2023-10-10 12:00:40 simmim_pretrain](main_simmim.py 218): INFO Train: [18/200][6000/6787] eta 0:03:19 lr 0.000189 time 0.2527 (0.2529) loss 0.3725 (0.3727) grad_norm 172115.6719 (inf) mem 14543MB +[2023-10-10 12:02:48 simmim_pretrain](main_simmim.py 218): INFO Train: [18/200][6500/6787] eta 0:01:12 lr 0.000190 time 
0.2593 (0.2530) loss 0.3618 (0.3728) grad_norm 194137.2812 (inf) mem 14543MB +[2023-10-10 12:04:01 simmim_pretrain](main_simmim.py 228): INFO EPOCH 18 training takes 0:28:38 +[2023-10-10 12:04:02 simmim_pretrain](main_simmim.py 218): INFO Train: [19/200][0/6787] eta 2:32:05 lr 0.000190 time 1.3446 (1.3446) loss 0.3613 (0.3613) grad_norm 464952.2188 (464952.2188) mem 14543MB +[2023-10-10 12:06:08 simmim_pretrain](main_simmim.py 218): INFO Train: [19/200][500/6787] eta 0:26:31 lr 0.000191 time 0.2536 (0.2532) loss 0.3670 (0.3738) grad_norm 212797.4062 (246745.7656) mem 14543MB +[2023-10-10 12:08:13 simmim_pretrain](main_simmim.py 218): INFO Train: [19/200][1000/6787] eta 0:24:17 lr 0.000192 time 0.2521 (0.2519) loss 0.3634 (0.3736) grad_norm 267791.7188 (266371.5312) mem 14543MB +[2023-10-10 12:10:19 simmim_pretrain](main_simmim.py 218): INFO Train: [19/200][1500/6787] eta 0:22:11 lr 0.000192 time 0.2545 (0.2518) loss 0.3799 (0.3732) grad_norm 311535.1562 (294528.4062) mem 14543MB +[2023-10-10 12:12:25 simmim_pretrain](main_simmim.py 218): INFO Train: [19/200][2000/6787] eta 0:20:05 lr 0.000193 time 0.2470 (0.2518) loss 0.3873 (0.3727) grad_norm 849710.6875 (328538.2188) mem 14543MB +[2023-10-10 12:14:31 simmim_pretrain](main_simmim.py 218): INFO Train: [19/200][2500/6787] eta 0:17:59 lr 0.000194 time 0.2561 (0.2519) loss 0.3572 (0.3726) grad_norm 539647.1875 (348918.1562) mem 14543MB +[2023-10-10 12:16:38 simmim_pretrain](main_simmim.py 218): INFO Train: [19/200][3000/6787] eta 0:15:54 lr 0.000194 time 0.2559 (0.2521) loss 0.3821 (0.3723) grad_norm 341284.0938 (inf) mem 14543MB +[2023-10-10 12:18:44 simmim_pretrain](main_simmim.py 218): INFO Train: [19/200][3500/6787] eta 0:13:49 lr 0.000195 time 0.2488 (0.2522) loss 0.3678 (0.3724) grad_norm 221480.1094 (inf) mem 14543MB +[2023-10-10 12:20:51 simmim_pretrain](main_simmim.py 218): INFO Train: [19/200][4000/6787] eta 0:11:43 lr 0.000196 time 0.2517 (0.2524) loss 0.3748 (0.3727) grad_norm 328909.1875 (inf) mem 14543MB 
+[2023-10-10 12:22:58 simmim_pretrain](main_simmim.py 218): INFO Train: [19/200][4500/6787] eta 0:09:37 lr 0.000197 time 0.2564 (0.2526) loss 0.3677 (0.3729) grad_norm 413293.4375 (inf) mem 14543MB +[2023-10-10 12:25:06 simmim_pretrain](main_simmim.py 218): INFO Train: [19/200][5000/6787] eta 0:07:32 lr 0.000197 time 0.2556 (0.2530) loss 0.3593 (0.3730) grad_norm 174848.3281 (inf) mem 14543MB +[2023-10-10 12:27:14 simmim_pretrain](main_simmim.py 218): INFO Train: [19/200][5500/6787] eta 0:05:25 lr 0.000198 time 0.2559 (0.2533) loss 0.3619 (0.3731) grad_norm 319927.0625 (inf) mem 14543MB +[2023-10-10 12:29:22 simmim_pretrain](main_simmim.py 218): INFO Train: [19/200][6000/6787] eta 0:03:19 lr 0.000199 time 0.2542 (0.2535) loss 0.3671 (0.3729) grad_norm 284328.2812 (inf) mem 14543MB +[2023-10-10 12:31:30 simmim_pretrain](main_simmim.py 218): INFO Train: [19/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2496 (0.2536) loss 0.3611 (0.3728) grad_norm 325417.2188 (inf) mem 14543MB +[2023-10-10 12:32:43 simmim_pretrain](main_simmim.py 228): INFO EPOCH 19 training takes 0:28:41 +[2023-10-10 12:32:44 simmim_pretrain](main_simmim.py 218): INFO Train: [20/200][0/6787] eta 2:29:16 lr 0.000200 time 1.3197 (1.3197) loss 0.3926 (0.3926) grad_norm 302473.0938 (302473.0938) mem 14543MB +[2023-10-10 12:34:50 simmim_pretrain](main_simmim.py 218): INFO Train: [20/200][500/6787] eta 0:26:31 lr 0.000200 time 0.2449 (0.2531) loss 0.3911 (0.3718) grad_norm 356058.3438 (379502.5000) mem 14543MB +[2023-10-10 12:36:56 simmim_pretrain](main_simmim.py 218): INFO Train: [20/200][1000/6787] eta 0:24:20 lr 0.000200 time 0.2467 (0.2524) loss 0.3344 (0.3719) grad_norm 461852.4062 (inf) mem 14543MB +[2023-10-10 12:39:02 simmim_pretrain](main_simmim.py 218): INFO Train: [20/200][1500/6787] eta 0:22:14 lr 0.000200 time 0.2537 (0.2524) loss 0.3749 (0.3719) grad_norm 966050.7500 (inf) mem 14543MB +[2023-10-10 12:41:08 simmim_pretrain](main_simmim.py 218): INFO Train: [20/200][2000/6787] eta 0:20:08 lr 
0.000200 time 0.2518 (0.2524) loss 0.3607 (0.3717) grad_norm 371261.0938 (inf) mem 14543MB +[2023-10-10 12:43:14 simmim_pretrain](main_simmim.py 218): INFO Train: [20/200][2500/6787] eta 0:18:00 lr 0.000200 time 0.2475 (0.2521) loss 0.3600 (0.3716) grad_norm 453384.7500 (inf) mem 14543MB +[2023-10-10 12:45:19 simmim_pretrain](main_simmim.py 218): INFO Train: [20/200][3000/6787] eta 0:15:54 lr 0.000200 time 0.2489 (0.2520) loss 0.3779 (0.3715) grad_norm 363761.8750 (inf) mem 14543MB +[2023-10-10 12:47:25 simmim_pretrain](main_simmim.py 218): INFO Train: [20/200][3500/6787] eta 0:13:48 lr 0.000200 time 0.2504 (0.2520) loss 0.3503 (0.3715) grad_norm 405188.3750 (inf) mem 14543MB +[2023-10-10 12:49:31 simmim_pretrain](main_simmim.py 218): INFO Train: [20/200][4000/6787] eta 0:11:42 lr 0.000200 time 0.2469 (0.2520) loss 0.3788 (0.3714) grad_norm 404538.0000 (inf) mem 14543MB +[2023-10-10 12:51:37 simmim_pretrain](main_simmim.py 218): INFO Train: [20/200][4500/6787] eta 0:09:36 lr 0.000200 time 0.2527 (0.2520) loss 0.3607 (0.3714) grad_norm 401065.4062 (inf) mem 14543MB +[2023-10-10 12:53:43 simmim_pretrain](main_simmim.py 218): INFO Train: [20/200][5000/6787] eta 0:07:30 lr 0.000200 time 0.2490 (0.2520) loss 0.3415 (0.3714) grad_norm 457884.4062 (inf) mem 14543MB +[2023-10-10 12:55:50 simmim_pretrain](main_simmim.py 218): INFO Train: [20/200][5500/6787] eta 0:05:24 lr 0.000200 time 0.2540 (0.2520) loss 0.3888 (0.3715) grad_norm 152101.7031 (inf) mem 14543MB +[2023-10-10 12:57:56 simmim_pretrain](main_simmim.py 218): INFO Train: [20/200][6000/6787] eta 0:03:18 lr 0.000200 time 0.2518 (0.2521) loss 0.3691 (0.3716) grad_norm 115370.9375 (inf) mem 14543MB +[2023-10-10 13:00:02 simmim_pretrain](main_simmim.py 218): INFO Train: [20/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2573 (0.2521) loss 0.3680 (0.3718) grad_norm 431211.0312 (inf) mem 14543MB +[2023-10-10 13:01:16 simmim_pretrain](main_simmim.py 228): INFO EPOCH 20 training takes 0:28:33 +[2023-10-10 13:01:16 
simmim_pretrain](utils.py 62): INFO /root/autodl-tmp/LSQ-simmim/checkpoint/simmim_pretrain/vit_simmim/ckpt_epoch_20.pth saving...... +[2023-10-10 13:01:17 simmim_pretrain](utils.py 64): INFO /root/autodl-tmp/LSQ-simmim/checkpoint/simmim_pretrain/vit_simmim/ckpt_epoch_20.pth saved !!! +[2023-10-10 13:01:18 simmim_pretrain](main_simmim.py 218): INFO Train: [21/200][0/6787] eta 2:23:19 lr 0.000200 time 1.2671 (1.2671) loss 0.3834 (0.3834) grad_norm 235104.2188 (235104.2188) mem 14543MB +[2023-10-10 13:03:24 simmim_pretrain](main_simmim.py 218): INFO Train: [21/200][500/6787] eta 0:26:30 lr 0.000200 time 0.2570 (0.2530) loss 0.3685 (0.3731) grad_norm 222242.5000 (275479.8125) mem 14543MB +[2023-10-10 13:05:29 simmim_pretrain](main_simmim.py 218): INFO Train: [21/200][1000/6787] eta 0:24:17 lr 0.000200 time 0.2494 (0.2519) loss 0.3713 (0.3718) grad_norm 424868.4062 (327895.8125) mem 14543MB +[2023-10-10 13:07:35 simmim_pretrain](main_simmim.py 218): INFO Train: [21/200][1500/6787] eta 0:22:10 lr 0.000200 time 0.2467 (0.2516) loss 0.3673 (0.3716) grad_norm 630535.5625 (352256.3750) mem 14543MB +[2023-10-10 13:09:40 simmim_pretrain](main_simmim.py 218): INFO Train: [21/200][2000/6787] eta 0:20:04 lr 0.000200 time 0.2512 (0.2515) loss 0.3743 (0.3714) grad_norm 846767.5625 (389730.0625) mem 14543MB +[2023-10-10 13:11:46 simmim_pretrain](main_simmim.py 218): INFO Train: [21/200][2500/6787] eta 0:17:58 lr 0.000200 time 0.2588 (0.2516) loss 0.3435 (0.3713) grad_norm 334865.5625 (392149.5625) mem 14543MB +[2023-10-10 13:13:52 simmim_pretrain](main_simmim.py 218): INFO Train: [21/200][3000/6787] eta 0:15:52 lr 0.000200 time 0.2594 (0.2516) loss 0.3633 (0.3712) grad_norm 259036.5000 (inf) mem 14543MB +[2023-10-10 13:15:58 simmim_pretrain](main_simmim.py 218): INFO Train: [21/200][3500/6787] eta 0:13:47 lr 0.000200 time 0.2527 (0.2516) loss 0.3686 (0.3712) grad_norm 262700.6875 (inf) mem 14543MB +[2023-10-10 13:18:04 simmim_pretrain](main_simmim.py 218): INFO Train: 
[21/200][4000/6787] eta 0:11:41 lr 0.000200 time 0.2464 (0.2516) loss 0.3899 (0.3712) grad_norm 409823.5312 (inf) mem 14543MB +[2023-10-10 13:20:10 simmim_pretrain](main_simmim.py 218): INFO Train: [21/200][4500/6787] eta 0:09:35 lr 0.000200 time 0.2471 (0.2516) loss 0.3811 (0.3711) grad_norm 286651.1562 (inf) mem 14543MB +[2023-10-10 13:22:17 simmim_pretrain](main_simmim.py 218): INFO Train: [21/200][5000/6787] eta 0:07:30 lr 0.000200 time 0.2606 (0.2520) loss 0.3807 (0.3711) grad_norm 592382.5000 (inf) mem 14543MB +[2023-10-10 13:24:27 simmim_pretrain](main_simmim.py 218): INFO Train: [21/200][5500/6787] eta 0:05:25 lr 0.000200 time 0.2603 (0.2527) loss 0.3903 (0.3711) grad_norm 286478.8125 (inf) mem 14543MB +[2023-10-10 13:26:37 simmim_pretrain](main_simmim.py 218): INFO Train: [21/200][6000/6787] eta 0:03:19 lr 0.000200 time 0.2598 (0.2533) loss 0.3667 (0.3710) grad_norm 451122.3438 (inf) mem 14543MB +[2023-10-10 13:28:47 simmim_pretrain](main_simmim.py 218): INFO Train: [21/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2524 (0.2537) loss 0.3643 (0.3710) grad_norm 154783.6250 (inf) mem 14543MB +[2023-10-10 13:30:02 simmim_pretrain](main_simmim.py 228): INFO EPOCH 21 training takes 0:28:44 +[2023-10-10 13:30:03 simmim_pretrain](main_simmim.py 218): INFO Train: [22/200][0/6787] eta 2:31:40 lr 0.000200 time 1.3409 (1.3409) loss 0.3810 (0.3810) grad_norm 289879.5938 (289879.5938) mem 14543MB +[2023-10-10 13:32:09 simmim_pretrain](main_simmim.py 218): INFO Train: [22/200][500/6787] eta 0:26:35 lr 0.000200 time 0.2547 (0.2538) loss 0.3609 (0.3736) grad_norm 270675.0312 (268461.7188) mem 14543MB +[2023-10-10 13:34:15 simmim_pretrain](main_simmim.py 218): INFO Train: [22/200][1000/6787] eta 0:24:23 lr 0.000200 time 0.2518 (0.2529) loss 0.3704 (0.3732) grad_norm 394348.3750 (259644.0938) mem 14543MB +[2023-10-10 13:36:20 simmim_pretrain](main_simmim.py 218): INFO Train: [22/200][1500/6787] eta 0:22:14 lr 0.000200 time 0.2502 (0.2525) loss 0.3590 (0.3720) grad_norm 
193207.6562 (273961.2812) mem 14543MB +[2023-10-10 13:38:26 simmim_pretrain](main_simmim.py 218): INFO Train: [22/200][2000/6787] eta 0:20:07 lr 0.000200 time 0.2446 (0.2522) loss 0.3753 (0.3715) grad_norm 227379.9531 (283740.4375) mem 14543MB +[2023-10-10 13:40:32 simmim_pretrain](main_simmim.py 218): INFO Train: [22/200][2500/6787] eta 0:18:00 lr 0.000200 time 0.2485 (0.2520) loss 0.3821 (0.3711) grad_norm 520665.6875 (300077.6875) mem 14543MB +[2023-10-10 13:42:37 simmim_pretrain](main_simmim.py 218): INFO Train: [22/200][3000/6787] eta 0:15:53 lr 0.000200 time 0.2529 (0.2519) loss 0.3750 (0.3708) grad_norm 405808.3438 (317226.6250) mem 14543MB +[2023-10-10 13:44:43 simmim_pretrain](main_simmim.py 218): INFO Train: [22/200][3500/6787] eta 0:13:47 lr 0.000200 time 0.2467 (0.2518) loss 0.3653 (0.3706) grad_norm 303378.8438 (inf) mem 14543MB +[2023-10-10 13:46:49 simmim_pretrain](main_simmim.py 218): INFO Train: [22/200][4000/6787] eta 0:11:41 lr 0.000200 time 0.2515 (0.2518) loss 0.3600 (0.3705) grad_norm 269864.5000 (inf) mem 14543MB +[2023-10-10 13:48:55 simmim_pretrain](main_simmim.py 218): INFO Train: [22/200][4500/6787] eta 0:09:35 lr 0.000200 time 0.2494 (0.2517) loss 0.3691 (0.3705) grad_norm 742575.8750 (inf) mem 14543MB +[2023-10-10 13:51:00 simmim_pretrain](main_simmim.py 218): INFO Train: [22/200][5000/6787] eta 0:07:29 lr 0.000200 time 0.2490 (0.2516) loss 0.3680 (0.3708) grad_norm 341472.4062 (inf) mem 14543MB +[2023-10-10 13:53:06 simmim_pretrain](main_simmim.py 218): INFO Train: [22/200][5500/6787] eta 0:05:23 lr 0.000200 time 0.2560 (0.2516) loss 0.3643 (0.3709) grad_norm 281018.0312 (inf) mem 14543MB +[2023-10-10 13:55:13 simmim_pretrain](main_simmim.py 218): INFO Train: [22/200][6000/6787] eta 0:03:18 lr 0.000200 time 0.2464 (0.2518) loss 0.3654 (0.3711) grad_norm 255319.9219 (inf) mem 14543MB +[2023-10-10 13:57:21 simmim_pretrain](main_simmim.py 218): INFO Train: [22/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2570 (0.2522) loss 0.3789 
(0.3712) grad_norm 211942.9688 (inf) mem 14543MB +[2023-10-10 13:58:35 simmim_pretrain](main_simmim.py 228): INFO EPOCH 22 training takes 0:28:33 +[2023-10-10 13:58:36 simmim_pretrain](main_simmim.py 218): INFO Train: [23/200][0/6787] eta 2:41:54 lr 0.000200 time 1.4313 (1.4313) loss 0.3728 (0.3728) grad_norm 229706.9688 (229706.9688) mem 14543MB +[2023-10-10 14:00:44 simmim_pretrain](main_simmim.py 218): INFO Train: [23/200][500/6787] eta 0:27:05 lr 0.000200 time 0.2596 (0.2585) loss 0.3633 (0.3691) grad_norm 414433.1562 (420590.0000) mem 14543MB +[2023-10-10 14:02:53 simmim_pretrain](main_simmim.py 218): INFO Train: [23/200][1000/6787] eta 0:24:51 lr 0.000200 time 0.2571 (0.2577) loss 0.3611 (0.3691) grad_norm 436161.5938 (404555.8125) mem 14543MB +[2023-10-10 14:05:01 simmim_pretrain](main_simmim.py 218): INFO Train: [23/200][1500/6787] eta 0:22:40 lr 0.000200 time 0.2550 (0.2573) loss 0.3597 (0.3696) grad_norm 752669.5000 (415470.2188) mem 14543MB +[2023-10-10 14:07:10 simmim_pretrain](main_simmim.py 218): INFO Train: [23/200][2000/6787] eta 0:20:31 lr 0.000200 time 0.2558 (0.2572) loss 0.3747 (0.3697) grad_norm 398325.4375 (440738.3750) mem 14543MB +[2023-10-10 14:09:18 simmim_pretrain](main_simmim.py 218): INFO Train: [23/200][2500/6787] eta 0:18:22 lr 0.000200 time 0.2560 (0.2572) loss 0.3679 (0.3695) grad_norm 380331.3125 (inf) mem 14543MB +[2023-10-10 14:11:27 simmim_pretrain](main_simmim.py 218): INFO Train: [23/200][3000/6787] eta 0:16:13 lr 0.000200 time 0.2596 (0.2572) loss 0.3509 (0.3693) grad_norm 256807.2188 (inf) mem 14543MB +[2023-10-10 14:13:35 simmim_pretrain](main_simmim.py 218): INFO Train: [23/200][3500/6787] eta 0:14:04 lr 0.000200 time 0.2601 (0.2570) loss 0.3745 (0.3693) grad_norm 368639.8750 (inf) mem 14543MB +[2023-10-10 14:15:43 simmim_pretrain](main_simmim.py 218): INFO Train: [23/200][4000/6787] eta 0:11:56 lr 0.000200 time 0.2542 (0.2570) loss 0.3767 (0.3694) grad_norm 232656.0156 (inf) mem 14543MB +[2023-10-10 14:17:51 
simmim_pretrain](main_simmim.py 218): INFO Train: [23/200][4500/6787] eta 0:09:47 lr 0.000200 time 0.2599 (0.2569) loss 0.3766 (0.3694) grad_norm 519942.1250 (inf) mem 14543MB +[2023-10-10 14:20:00 simmim_pretrain](main_simmim.py 218): INFO Train: [23/200][5000/6787] eta 0:07:39 lr 0.000200 time 0.2558 (0.2570) loss 0.3692 (0.3694) grad_norm 196221.0000 (inf) mem 14543MB +[2023-10-10 14:22:09 simmim_pretrain](main_simmim.py 218): INFO Train: [23/200][5500/6787] eta 0:05:30 lr 0.000200 time 0.2588 (0.2570) loss 0.3740 (0.3694) grad_norm 454338.6250 (inf) mem 14543MB +[2023-10-10 14:24:17 simmim_pretrain](main_simmim.py 218): INFO Train: [23/200][6000/6787] eta 0:03:22 lr 0.000200 time 0.2586 (0.2570) loss 0.3660 (0.3694) grad_norm 584048.8125 (inf) mem 14543MB +[2023-10-10 14:26:27 simmim_pretrain](main_simmim.py 218): INFO Train: [23/200][6500/6787] eta 0:01:13 lr 0.000200 time 0.2674 (0.2572) loss 0.3715 (0.3694) grad_norm 496958.9688 (inf) mem 14543MB +[2023-10-10 14:27:41 simmim_pretrain](main_simmim.py 228): INFO EPOCH 23 training takes 0:29:06 +[2023-10-10 14:27:42 simmim_pretrain](main_simmim.py 218): INFO Train: [24/200][0/6787] eta 2:32:36 lr 0.000200 time 1.3492 (1.3492) loss 0.3663 (0.3663) grad_norm 379275.5000 (379275.5000) mem 14543MB +[2023-10-10 14:29:51 simmim_pretrain](main_simmim.py 218): INFO Train: [24/200][500/6787] eta 0:27:15 lr 0.000200 time 0.2597 (0.2602) loss 0.3696 (0.3697) grad_norm 483015.3125 (464004.2188) mem 14543MB +[2023-10-10 14:32:00 simmim_pretrain](main_simmim.py 218): INFO Train: [24/200][1000/6787] eta 0:24:58 lr 0.000200 time 0.2615 (0.2590) loss 0.3676 (0.3705) grad_norm 348116.3750 (inf) mem 14543MB +[2023-10-10 14:34:10 simmim_pretrain](main_simmim.py 218): INFO Train: [24/200][1500/6787] eta 0:22:48 lr 0.000200 time 0.2604 (0.2589) loss 0.3557 (0.3712) grad_norm 233333.4531 (inf) mem 14543MB +[2023-10-10 14:36:18 simmim_pretrain](main_simmim.py 218): INFO Train: [24/200][2000/6787] eta 0:20:37 lr 0.000200 time 0.2571 
(0.2586) loss 0.3783 (0.3716) grad_norm 469070.0938 (inf) mem 14543MB +[2023-10-10 14:38:28 simmim_pretrain](main_simmim.py 218): INFO Train: [24/200][2500/6787] eta 0:18:28 lr 0.000200 time 0.2593 (0.2585) loss 0.3658 (0.3714) grad_norm 397583.6875 (inf) mem 14543MB +[2023-10-10 14:40:37 simmim_pretrain](main_simmim.py 218): INFO Train: [24/200][3000/6787] eta 0:16:19 lr 0.000200 time 0.2541 (0.2585) loss 0.3706 (0.3713) grad_norm 334887.9062 (inf) mem 14543MB +[2023-10-10 14:42:46 simmim_pretrain](main_simmim.py 218): INFO Train: [24/200][3500/6787] eta 0:14:10 lr 0.000200 time 0.2584 (0.2586) loss 0.3691 (0.3709) grad_norm 303614.9062 (inf) mem 14543MB +[2023-10-10 14:44:56 simmim_pretrain](main_simmim.py 218): INFO Train: [24/200][4000/6787] eta 0:12:00 lr 0.000200 time 0.2583 (0.2586) loss 0.3598 (0.3708) grad_norm 268938.7188 (inf) mem 14543MB +[2023-10-10 14:47:05 simmim_pretrain](main_simmim.py 218): INFO Train: [24/200][4500/6787] eta 0:09:51 lr 0.000200 time 0.2702 (0.2585) loss 0.3562 (0.3709) grad_norm 341748.3438 (inf) mem 14543MB +[2023-10-10 14:49:14 simmim_pretrain](main_simmim.py 218): INFO Train: [24/200][5000/6787] eta 0:07:41 lr 0.000200 time 0.2560 (0.2585) loss 0.3705 (0.3709) grad_norm 261469.5469 (inf) mem 14543MB +[2023-10-10 14:51:23 simmim_pretrain](main_simmim.py 218): INFO Train: [24/200][5500/6787] eta 0:05:32 lr 0.000200 time 0.2587 (0.2585) loss 0.3703 (0.3709) grad_norm 298047.9688 (inf) mem 14543MB +[2023-10-10 14:53:32 simmim_pretrain](main_simmim.py 218): INFO Train: [24/200][6000/6787] eta 0:03:23 lr 0.000200 time 0.2491 (0.2585) loss 0.3607 (0.3708) grad_norm 342671.8438 (inf) mem 14543MB +[2023-10-10 14:55:42 simmim_pretrain](main_simmim.py 218): INFO Train: [24/200][6500/6787] eta 0:01:14 lr 0.000200 time 0.2568 (0.2585) loss 0.3590 (0.3706) grad_norm 449279.7188 (inf) mem 14543MB +[2023-10-10 14:56:56 simmim_pretrain](main_simmim.py 228): INFO EPOCH 24 training takes 0:29:15 +[2023-10-10 14:56:58 
simmim_pretrain](main_simmim.py 218): INFO Train: [25/200][0/6787] eta 2:19:38 lr 0.000200 time 1.2346 (1.2346) loss 0.3524 (0.3524) grad_norm 642508.6250 (642508.6250) mem 14543MB +[2023-10-10 14:59:07 simmim_pretrain](main_simmim.py 218): INFO Train: [25/200][500/6787] eta 0:27:13 lr 0.000200 time 0.2540 (0.2598) loss 0.3810 (0.3692) grad_norm 483071.6875 (447253.1562) mem 14543MB +[2023-10-10 15:01:15 simmim_pretrain](main_simmim.py 218): INFO Train: [25/200][1000/6787] eta 0:24:56 lr 0.000200 time 0.2555 (0.2586) loss 0.3376 (0.3692) grad_norm 475614.5625 (inf) mem 14543MB +[2023-10-10 15:03:24 simmim_pretrain](main_simmim.py 218): INFO Train: [25/200][1500/6787] eta 0:22:45 lr 0.000200 time 0.2480 (0.2582) loss 0.3769 (0.3690) grad_norm 412166.6875 (inf) mem 14543MB +[2023-10-10 15:05:33 simmim_pretrain](main_simmim.py 218): INFO Train: [25/200][2000/6787] eta 0:20:35 lr 0.000200 time 0.2608 (0.2581) loss 0.4118 (0.3691) grad_norm 215914.2188 (inf) mem 14543MB +[2023-10-10 15:07:42 simmim_pretrain](main_simmim.py 218): INFO Train: [25/200][2500/6787] eta 0:18:26 lr 0.000200 time 0.2592 (0.2580) loss 0.3598 (0.3696) grad_norm 356511.2188 (inf) mem 14543MB +[2023-10-10 15:09:51 simmim_pretrain](main_simmim.py 218): INFO Train: [25/200][3000/6787] eta 0:16:17 lr 0.000200 time 0.2543 (0.2580) loss 0.3507 (0.3698) grad_norm 249067.2031 (inf) mem 14543MB +[2023-10-10 15:12:00 simmim_pretrain](main_simmim.py 218): INFO Train: [25/200][3500/6787] eta 0:14:07 lr 0.000200 time 0.2595 (0.2580) loss 0.3824 (0.3700) grad_norm 306355.7500 (inf) mem 14543MB +[2023-10-10 15:14:09 simmim_pretrain](main_simmim.py 218): INFO Train: [25/200][4000/6787] eta 0:11:59 lr 0.000200 time 0.2556 (0.2580) loss 0.3645 (0.3699) grad_norm 264329.4688 (inf) mem 14543MB +[2023-10-10 15:16:18 simmim_pretrain](main_simmim.py 218): INFO Train: [25/200][4500/6787] eta 0:09:50 lr 0.000200 time 0.2596 (0.2581) loss 0.3684 (0.3698) grad_norm 201503.5312 (inf) mem 14543MB +[2023-10-10 15:18:27 
simmim_pretrain](main_simmim.py 218): INFO Train: [25/200][5000/6787] eta 0:07:41 lr 0.000200 time 0.2620 (0.2581) loss 0.3564 (0.3697) grad_norm 391118.1875 (inf) mem 14543MB +[2023-10-10 15:20:36 simmim_pretrain](main_simmim.py 218): INFO Train: [25/200][5500/6787] eta 0:05:32 lr 0.000200 time 0.2599 (0.2581) loss 0.3547 (0.3696) grad_norm 544776.5000 (inf) mem 14543MB +[2023-10-10 15:22:45 simmim_pretrain](main_simmim.py 218): INFO Train: [25/200][6000/6787] eta 0:03:23 lr 0.000200 time 0.3232 (0.2581) loss 0.3382 (0.3695) grad_norm 322811.3438 (inf) mem 14543MB +[2023-10-10 15:24:54 simmim_pretrain](main_simmim.py 218): INFO Train: [25/200][6500/6787] eta 0:01:14 lr 0.000200 time 0.2552 (0.2581) loss 0.3623 (0.3695) grad_norm 281809.4062 (inf) mem 14543MB +[2023-10-10 15:26:09 simmim_pretrain](main_simmim.py 228): INFO EPOCH 25 training takes 0:29:12 +[2023-10-10 15:26:10 simmim_pretrain](main_simmim.py 218): INFO Train: [26/200][0/6787] eta 2:32:04 lr 0.000200 time 1.3445 (1.3445) loss 0.3839 (0.3839) grad_norm 391179.9375 (391179.9375) mem 14543MB +[2023-10-10 15:28:20 simmim_pretrain](main_simmim.py 218): INFO Train: [26/200][500/6787] eta 0:27:21 lr 0.000200 time 0.2600 (0.2610) loss 0.3830 (0.3718) grad_norm 160152.8594 (280513.9688) mem 14543MB +[2023-10-10 15:30:29 simmim_pretrain](main_simmim.py 218): INFO Train: [26/200][1000/6787] eta 0:25:01 lr 0.000200 time 0.2572 (0.2595) loss 0.3767 (0.3703) grad_norm 212435.6250 (273766.3438) mem 14543MB +[2023-10-10 15:32:38 simmim_pretrain](main_simmim.py 218): INFO Train: [26/200][1500/6787] eta 0:22:50 lr 0.000200 time 0.2560 (0.2592) loss 0.3584 (0.3704) grad_norm 289258.7188 (inf) mem 14543MB +[2023-10-10 15:34:47 simmim_pretrain](main_simmim.py 218): INFO Train: [26/200][2000/6787] eta 0:20:38 lr 0.000200 time 0.2600 (0.2588) loss 0.3674 (0.3704) grad_norm 219745.4219 (inf) mem 14543MB +[2023-10-10 15:36:56 simmim_pretrain](main_simmim.py 218): INFO Train: [26/200][2500/6787] eta 0:18:29 lr 0.000200 time 
0.2580 (0.2587) loss 0.3742 (0.3705) grad_norm 251749.8594 (inf) mem 14543MB +[2023-10-10 15:39:05 simmim_pretrain](main_simmim.py 218): INFO Train: [26/200][3000/6787] eta 0:16:19 lr 0.000200 time 0.2593 (0.2587) loss 0.3725 (0.3704) grad_norm 155780.5312 (inf) mem 14543MB +[2023-10-10 15:41:15 simmim_pretrain](main_simmim.py 218): INFO Train: [26/200][3500/6787] eta 0:14:10 lr 0.000200 time 0.2575 (0.2589) loss 0.3571 (0.3702) grad_norm 587182.5000 (inf) mem 14543MB +[2023-10-10 15:43:25 simmim_pretrain](main_simmim.py 218): INFO Train: [26/200][4000/6787] eta 0:12:01 lr 0.000200 time 0.2541 (0.2588) loss 0.3812 (0.3700) grad_norm 330259.1875 (inf) mem 14543MB +[2023-10-10 15:45:34 simmim_pretrain](main_simmim.py 218): INFO Train: [26/200][4500/6787] eta 0:09:51 lr 0.000200 time 0.2606 (0.2587) loss 0.3644 (0.3700) grad_norm 410214.2500 (inf) mem 14543MB +[2023-10-10 15:47:43 simmim_pretrain](main_simmim.py 218): INFO Train: [26/200][5000/6787] eta 0:07:42 lr 0.000200 time 0.2620 (0.2588) loss 0.3703 (0.3698) grad_norm 684560.3750 (inf) mem 14543MB +[2023-10-10 15:49:53 simmim_pretrain](main_simmim.py 218): INFO Train: [26/200][5500/6787] eta 0:05:33 lr 0.000200 time 0.2609 (0.2589) loss 0.3766 (0.3697) grad_norm 433148.9688 (inf) mem 14543MB +[2023-10-10 15:52:03 simmim_pretrain](main_simmim.py 218): INFO Train: [26/200][6000/6787] eta 0:03:23 lr 0.000200 time 0.2597 (0.2589) loss 0.3448 (0.3696) grad_norm 389326.5000 (inf) mem 14543MB +[2023-10-10 15:54:11 simmim_pretrain](main_simmim.py 218): INFO Train: [26/200][6500/6787] eta 0:01:14 lr 0.000200 time 0.2617 (0.2588) loss 0.4199 (0.3753) grad_norm 21360.9102 (inf) mem 14543MB +[2023-10-10 15:55:26 simmim_pretrain](main_simmim.py 228): INFO EPOCH 26 training takes 0:29:16 +[2023-10-10 15:55:27 simmim_pretrain](main_simmim.py 218): INFO Train: [27/200][0/6787] eta 2:22:33 lr 0.000200 time 1.2603 (1.2603) loss 0.3957 (0.3957) grad_norm 37415.8945 (37415.8945) mem 14543MB +[2023-10-10 15:57:36 
simmim_pretrain](main_simmim.py 218): INFO Train: [27/200][500/6787] eta 0:27:18 lr 0.000200 time 0.2585 (0.2606) loss 0.3674 (0.3884) grad_norm 23052.9512 (31458.7070) mem 14543MB +[2023-10-10 15:59:46 simmim_pretrain](main_simmim.py 218): INFO Train: [27/200][1000/6787] eta 0:25:05 lr 0.000200 time 0.2540 (0.2601) loss 0.3839 (0.3843) grad_norm 22628.7949 (31893.4414) mem 14543MB +[2023-10-10 16:01:56 simmim_pretrain](main_simmim.py 218): INFO Train: [27/200][1500/6787] eta 0:22:54 lr 0.000200 time 0.2680 (0.2600) loss 0.3809 (0.3815) grad_norm 28995.1387 (32161.4082) mem 14543MB +[2023-10-10 16:04:06 simmim_pretrain](main_simmim.py 218): INFO Train: [27/200][2000/6787] eta 0:20:44 lr 0.000200 time 0.2663 (0.2599) loss 0.3732 (0.3793) grad_norm 55555.1836 (35052.3203) mem 14543MB +[2023-10-10 16:06:15 simmim_pretrain](main_simmim.py 218): INFO Train: [27/200][2500/6787] eta 0:18:33 lr 0.000200 time 0.2539 (0.2597) loss 0.3566 (0.3779) grad_norm 59955.3164 (38509.3945) mem 14543MB +[2023-10-10 16:08:24 simmim_pretrain](main_simmim.py 218): INFO Train: [27/200][3000/6787] eta 0:16:21 lr 0.000200 time 0.2657 (0.2593) loss 0.3595 (0.3769) grad_norm 45988.8633 (41120.0352) mem 14543MB +[2023-10-10 16:10:34 simmim_pretrain](main_simmim.py 218): INFO Train: [27/200][3500/6787] eta 0:14:12 lr 0.000200 time 0.2578 (0.2593) loss 0.3818 (0.3761) grad_norm 49253.2695 (43320.5508) mem 14543MB +[2023-10-10 16:12:43 simmim_pretrain](main_simmim.py 218): INFO Train: [27/200][4000/6787] eta 0:12:02 lr 0.000200 time 0.2587 (0.2593) loss 0.3730 (0.3753) grad_norm 79705.6094 (47119.9258) mem 14543MB +[2023-10-10 16:14:54 simmim_pretrain](main_simmim.py 218): INFO Train: [27/200][4500/6787] eta 0:09:53 lr 0.000200 time 0.2582 (0.2596) loss 0.3524 (0.3746) grad_norm 138937.0781 (52247.7617) mem 14543MB +[2023-10-10 16:17:04 simmim_pretrain](main_simmim.py 218): INFO Train: [27/200][5000/6787] eta 0:07:43 lr 0.000200 time 0.2699 (0.2596) loss 0.3621 (0.3741) grad_norm 96436.4453 
(56802.6055) mem 14543MB +[2023-10-10 16:19:13 simmim_pretrain](main_simmim.py 218): INFO Train: [27/200][5500/6787] eta 0:05:33 lr 0.000200 time 0.2577 (0.2595) loss 0.3819 (0.3736) grad_norm 85795.3125 (61281.0195) mem 14543MB +[2023-10-10 16:21:22 simmim_pretrain](main_simmim.py 218): INFO Train: [27/200][6000/6787] eta 0:03:24 lr 0.000200 time 0.2525 (0.2594) loss 0.3586 (0.3732) grad_norm 61373.0898 (68195.5078) mem 14543MB +[2023-10-10 16:23:32 simmim_pretrain](main_simmim.py 218): INFO Train: [27/200][6500/6787] eta 0:01:14 lr 0.000200 time 0.2507 (0.2593) loss 0.3549 (0.3729) grad_norm 83640.8047 (74236.5156) mem 14543MB +[2023-10-10 16:24:47 simmim_pretrain](main_simmim.py 228): INFO EPOCH 27 training takes 0:29:21 +[2023-10-10 16:24:48 simmim_pretrain](main_simmim.py 218): INFO Train: [28/200][0/6787] eta 2:30:28 lr 0.000200 time 1.3302 (1.3302) loss 0.3544 (0.3544) grad_norm 187873.9531 (187873.9531) mem 14543MB +[2023-10-10 16:26:58 simmim_pretrain](main_simmim.py 218): INFO Train: [28/200][500/6787] eta 0:27:29 lr 0.000200 time 0.3378 (0.2624) loss 0.3951 (0.3669) grad_norm 63378.6602 (174702.1250) mem 14543MB +[2023-10-10 16:29:08 simmim_pretrain](main_simmim.py 218): INFO Train: [28/200][1000/6787] eta 0:25:11 lr 0.000200 time 0.2545 (0.2611) loss 0.3523 (0.3669) grad_norm 201273.6875 (195749.8281) mem 14543MB +[2023-10-10 16:31:17 simmim_pretrain](main_simmim.py 218): INFO Train: [28/200][1500/6787] eta 0:22:54 lr 0.000200 time 0.2578 (0.2601) loss 0.3694 (0.3672) grad_norm 253867.1719 (216915.2969) mem 14543MB +[2023-10-10 16:33:27 simmim_pretrain](main_simmim.py 218): INFO Train: [28/200][2000/6787] eta 0:20:43 lr 0.000200 time 0.2603 (0.2598) loss 0.3662 (0.3672) grad_norm 398629.9062 (268910.2188) mem 14543MB +[2023-10-10 16:35:37 simmim_pretrain](main_simmim.py 218): INFO Train: [28/200][2500/6787] eta 0:18:33 lr 0.000200 time 0.2618 (0.2597) loss 0.3522 (0.3669) grad_norm 674539.5000 (295994.8438) mem 14543MB +[2023-10-10 16:37:47 
simmim_pretrain](main_simmim.py 218): INFO Train: [28/200][3000/6787] eta 0:16:23 lr 0.000200 time 0.2590 (0.2597) loss 0.3657 (0.3670) grad_norm 339590.1250 (inf) mem 14543MB +[2023-10-10 16:39:57 simmim_pretrain](main_simmim.py 218): INFO Train: [28/200][3500/6787] eta 0:14:14 lr 0.000200 time 0.2547 (0.2600) loss 0.3941 (0.3670) grad_norm 301137.6875 (inf) mem 14543MB +[2023-10-10 16:42:06 simmim_pretrain](main_simmim.py 218): INFO Train: [28/200][4000/6787] eta 0:12:03 lr 0.000200 time 0.2603 (0.2596) loss 0.3630 (0.3670) grad_norm 308230.6562 (inf) mem 14543MB +[2023-10-10 16:44:15 simmim_pretrain](main_simmim.py 218): INFO Train: [28/200][4500/6787] eta 0:09:53 lr 0.000200 time 0.2593 (0.2594) loss 0.3493 (0.3667) grad_norm 487719.5625 (inf) mem 14543MB +[2023-10-10 16:46:24 simmim_pretrain](main_simmim.py 218): INFO Train: [28/200][5000/6787] eta 0:07:43 lr 0.000200 time 0.2737 (0.2594) loss 0.3537 (0.3667) grad_norm 498045.3438 (inf) mem 14543MB +[2023-10-10 16:48:34 simmim_pretrain](main_simmim.py 218): INFO Train: [28/200][5500/6787] eta 0:05:33 lr 0.000200 time 0.2605 (0.2594) loss 0.3700 (0.3668) grad_norm 321145.5625 (inf) mem 14543MB +[2023-10-10 16:50:44 simmim_pretrain](main_simmim.py 218): INFO Train: [28/200][6000/6787] eta 0:03:24 lr 0.000200 time 0.2558 (0.2594) loss 0.3826 (0.3668) grad_norm 329313.7500 (inf) mem 14543MB +[2023-10-10 16:52:53 simmim_pretrain](main_simmim.py 218): INFO Train: [28/200][6500/6787] eta 0:01:14 lr 0.000200 time 0.2597 (0.2594) loss 0.3861 (0.3669) grad_norm 391732.5938 (inf) mem 14543MB +[2023-10-10 16:54:08 simmim_pretrain](main_simmim.py 228): INFO EPOCH 28 training takes 0:29:20 +[2023-10-10 16:54:09 simmim_pretrain](main_simmim.py 218): INFO Train: [29/200][0/6787] eta 2:32:37 lr 0.000200 time 1.3493 (1.3493) loss 0.3603 (0.3603) grad_norm 315861.4375 (315861.4375) mem 14543MB +[2023-10-10 16:56:18 simmim_pretrain](main_simmim.py 218): INFO Train: [29/200][500/6787] eta 0:27:14 lr 0.000200 time 0.2595 (0.2600) 
loss 0.3534 (0.3667) grad_norm 434402.4375 (inf) mem 14543MB +[2023-10-10 16:58:27 simmim_pretrain](main_simmim.py 218): INFO Train: [29/200][1000/6787] eta 0:24:59 lr 0.000200 time 0.2564 (0.2591) loss 0.3626 (0.3665) grad_norm 366216.0312 (inf) mem 14543MB +[2023-10-10 17:00:37 simmim_pretrain](main_simmim.py 218): INFO Train: [29/200][1500/6787] eta 0:22:51 lr 0.000200 time 0.2586 (0.2593) loss 0.3572 (0.3663) grad_norm 355779.8438 (inf) mem 14543MB +[2023-10-10 17:02:47 simmim_pretrain](main_simmim.py 218): INFO Train: [29/200][2000/6787] eta 0:20:43 lr 0.000200 time 0.2625 (0.2598) loss 0.3775 (0.3664) grad_norm 151953.0000 (inf) mem 14543MB +[2023-10-10 17:04:57 simmim_pretrain](main_simmim.py 218): INFO Train: [29/200][2500/6787] eta 0:18:33 lr 0.000200 time 0.2515 (0.2598) loss 0.3604 (0.3667) grad_norm 291492.6875 (inf) mem 14543MB +[2023-10-10 17:07:06 simmim_pretrain](main_simmim.py 218): INFO Train: [29/200][3000/6787] eta 0:16:22 lr 0.000200 time 0.2589 (0.2595) loss 0.3652 (0.3669) grad_norm 173830.3906 (inf) mem 14543MB +[2023-10-10 17:09:16 simmim_pretrain](main_simmim.py 218): INFO Train: [29/200][3500/6787] eta 0:14:13 lr 0.000200 time 0.2566 (0.2596) loss 0.3779 (0.3670) grad_norm 217783.1875 (inf) mem 14543MB +[2023-10-10 17:11:26 simmim_pretrain](main_simmim.py 218): INFO Train: [29/200][4000/6787] eta 0:12:03 lr 0.000200 time 0.2574 (0.2596) loss 0.3743 (0.3672) grad_norm 218028.3438 (inf) mem 14543MB +[2023-10-10 17:13:37 simmim_pretrain](main_simmim.py 218): INFO Train: [29/200][4500/6787] eta 0:09:53 lr 0.000200 time 0.2543 (0.2597) loss 0.3449 (0.3673) grad_norm 306959.4688 (inf) mem 14543MB +[2023-10-10 17:15:46 simmim_pretrain](main_simmim.py 218): INFO Train: [29/200][5000/6787] eta 0:07:44 lr 0.000200 time 0.2528 (0.2597) loss 0.3803 (0.3672) grad_norm 335075.6875 (inf) mem 14543MB +[2023-10-10 17:17:55 simmim_pretrain](main_simmim.py 218): INFO Train: [29/200][5500/6787] eta 0:05:34 lr 0.000200 time 0.2551 (0.2596) loss 0.3613 
(0.3672) grad_norm 368315.3438 (inf) mem 14543MB +[2023-10-10 17:20:06 simmim_pretrain](main_simmim.py 218): INFO Train: [29/200][6000/6787] eta 0:03:24 lr 0.000200 time 0.2604 (0.2597) loss 0.3827 (0.3671) grad_norm 491268.8438 (inf) mem 14543MB +[2023-10-10 17:22:16 simmim_pretrain](main_simmim.py 218): INFO Train: [29/200][6500/6787] eta 0:01:14 lr 0.000200 time 0.2598 (0.2597) loss 0.3631 (0.3670) grad_norm 389803.7500 (inf) mem 14543MB +[2023-10-10 17:23:30 simmim_pretrain](main_simmim.py 228): INFO EPOCH 29 training takes 0:29:22 +[2023-10-10 17:23:32 simmim_pretrain](main_simmim.py 218): INFO Train: [30/200][0/6787] eta 2:40:39 lr 0.000200 time 1.4203 (1.4203) loss 0.3706 (0.3706) grad_norm 325290.2188 (325290.2188) mem 14543MB +[2023-10-10 17:25:42 simmim_pretrain](main_simmim.py 218): INFO Train: [30/200][500/6787] eta 0:27:26 lr 0.000200 time 0.2592 (0.2619) loss 0.3641 (0.3672) grad_norm 310988.2812 (293879.4062) mem 14543MB +[2023-10-10 17:27:51 simmim_pretrain](main_simmim.py 218): INFO Train: [30/200][1000/6787] eta 0:25:09 lr 0.000200 time 0.2510 (0.2608) loss 0.3746 (0.3677) grad_norm 199780.5156 (271692.2812) mem 14543MB +[2023-10-10 17:30:01 simmim_pretrain](main_simmim.py 218): INFO Train: [30/200][1500/6787] eta 0:22:54 lr 0.000200 time 0.2555 (0.2601) loss 0.3648 (0.3676) grad_norm 241104.8281 (262166.2812) mem 14543MB +[2023-10-10 17:32:11 simmim_pretrain](main_simmim.py 218): INFO Train: [30/200][2000/6787] eta 0:20:44 lr 0.000200 time 0.2571 (0.2600) loss 0.3735 (0.3675) grad_norm 590034.1875 (256445.9219) mem 14543MB +[2023-10-10 17:34:20 simmim_pretrain](main_simmim.py 218): INFO Train: [30/200][2500/6787] eta 0:18:34 lr 0.000200 time 0.2565 (0.2599) loss 0.3631 (0.3675) grad_norm 388314.3750 (263672.1562) mem 14543MB +[2023-10-10 17:36:30 simmim_pretrain](main_simmim.py 218): INFO Train: [30/200][3000/6787] eta 0:16:23 lr 0.000200 time 0.2633 (0.2597) loss 0.3639 (0.3674) grad_norm 335827.6562 (284529.8125) mem 14543MB +[2023-10-10 
17:38:38 simmim_pretrain](main_simmim.py 218): INFO Train: [30/200][3500/6787] eta 0:14:12 lr 0.000200 time 0.2581 (0.2593) loss 0.3755 (0.3673) grad_norm 264653.5938 (297412.7188) mem 14543MB +[2023-10-10 17:40:47 simmim_pretrain](main_simmim.py 218): INFO Train: [30/200][4000/6787] eta 0:12:01 lr 0.000200 time 0.2525 (0.2590) loss 0.3548 (0.3672) grad_norm inf (inf) mem 14543MB +[2023-10-10 17:42:56 simmim_pretrain](main_simmim.py 218): INFO Train: [30/200][4500/6787] eta 0:09:52 lr 0.000200 time 0.2599 (0.2589) loss 0.3550 (0.3669) grad_norm 574306.4375 (inf) mem 14543MB +[2023-10-10 17:45:05 simmim_pretrain](main_simmim.py 218): INFO Train: [30/200][5000/6787] eta 0:07:42 lr 0.000200 time 0.2599 (0.2589) loss 0.3664 (0.3668) grad_norm 356478.4688 (inf) mem 14543MB +[2023-10-10 17:47:15 simmim_pretrain](main_simmim.py 218): INFO Train: [30/200][5500/6787] eta 0:05:33 lr 0.000200 time 0.2587 (0.2589) loss 0.3625 (0.3668) grad_norm 197556.9219 (inf) mem 14543MB +[2023-10-10 17:49:24 simmim_pretrain](main_simmim.py 218): INFO Train: [30/200][6000/6787] eta 0:03:23 lr 0.000200 time 0.2586 (0.2589) loss 0.3650 (0.3668) grad_norm 185451.1719 (inf) mem 14543MB +[2023-10-10 17:51:34 simmim_pretrain](main_simmim.py 218): INFO Train: [30/200][6500/6787] eta 0:01:14 lr 0.000200 time 0.2600 (0.2589) loss 0.3594 (0.3668) grad_norm 243726.2812 (inf) mem 14543MB +[2023-10-10 17:52:48 simmim_pretrain](main_simmim.py 228): INFO EPOCH 30 training takes 0:29:18 +[2023-10-10 17:52:50 simmim_pretrain](main_simmim.py 218): INFO Train: [31/200][0/6787] eta 2:47:00 lr 0.000200 time 1.4765 (1.4765) loss 0.3584 (0.3584) grad_norm 279391.9375 (279391.9375) mem 14543MB +[2023-10-10 17:54:55 simmim_pretrain](main_simmim.py 218): INFO Train: [31/200][500/6787] eta 0:26:28 lr 0.000200 time 0.2529 (0.2526) loss 0.3634 (0.3679) grad_norm 199318.0469 (300677.0625) mem 14543MB +[2023-10-10 17:57:01 simmim_pretrain](main_simmim.py 218): INFO Train: [31/200][1000/6787] eta 0:24:17 lr 0.000200 time 
0.2514 (0.2518) loss 0.3859 (0.3678) grad_norm 134295.1719 (inf) mem 14543MB +[2023-10-10 17:59:06 simmim_pretrain](main_simmim.py 218): INFO Train: [31/200][1500/6787] eta 0:22:10 lr 0.000200 time 0.2488 (0.2517) loss 0.3656 (0.3686) grad_norm 172415.0156 (inf) mem 14543MB +[2023-10-10 18:01:12 simmim_pretrain](main_simmim.py 218): INFO Train: [31/200][2000/6787] eta 0:20:05 lr 0.000200 time 0.2463 (0.2517) loss 0.3565 (0.3691) grad_norm 109556.2266 (inf) mem 14543MB +[2023-10-10 18:03:18 simmim_pretrain](main_simmim.py 218): INFO Train: [31/200][2500/6787] eta 0:17:58 lr 0.000200 time 0.2582 (0.2515) loss 0.3634 (0.3693) grad_norm 97669.3125 (inf) mem 14543MB +[2023-10-10 18:05:23 simmim_pretrain](main_simmim.py 218): INFO Train: [31/200][3000/6787] eta 0:15:52 lr 0.000200 time 0.2495 (0.2516) loss 0.3709 (0.3691) grad_norm 84036.6562 (inf) mem 14543MB +[2023-10-10 18:07:30 simmim_pretrain](main_simmim.py 218): INFO Train: [31/200][3500/6787] eta 0:13:47 lr 0.000200 time 0.2586 (0.2517) loss 0.3605 (0.3689) grad_norm 99617.6484 (inf) mem 14543MB +[2023-10-10 18:09:36 simmim_pretrain](main_simmim.py 218): INFO Train: [31/200][4000/6787] eta 0:11:41 lr 0.000200 time 0.2593 (0.2518) loss 0.3570 (0.3687) grad_norm 95499.8047 (inf) mem 14543MB +[2023-10-10 18:11:42 simmim_pretrain](main_simmim.py 218): INFO Train: [31/200][4500/6787] eta 0:09:36 lr 0.000200 time 0.2465 (0.2519) loss 0.3641 (0.3686) grad_norm 235403.1562 (inf) mem 14543MB +[2023-10-10 18:13:48 simmim_pretrain](main_simmim.py 218): INFO Train: [31/200][5000/6787] eta 0:07:30 lr 0.000200 time 0.2507 (0.2520) loss 0.3516 (0.3685) grad_norm 161023.8750 (inf) mem 14543MB +[2023-10-10 18:15:55 simmim_pretrain](main_simmim.py 218): INFO Train: [31/200][5500/6787] eta 0:05:24 lr 0.000200 time 0.2511 (0.2520) loss 0.3751 (0.3683) grad_norm 241905.4688 (inf) mem 14543MB +[2023-10-10 18:18:03 simmim_pretrain](main_simmim.py 218): INFO Train: [31/200][6000/6787] eta 0:03:18 lr 0.000200 time 0.2599 (0.2524) loss 
0.3895 (0.3680) grad_norm 471995.2500 (inf) mem 14543MB +[2023-10-10 18:20:12 simmim_pretrain](main_simmim.py 218): INFO Train: [31/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2575 (0.2529) loss 0.3712 (0.3678) grad_norm 501597.9375 (inf) mem 14543MB +[2023-10-10 18:21:27 simmim_pretrain](main_simmim.py 228): INFO EPOCH 31 training takes 0:28:38 +[2023-10-10 18:21:29 simmim_pretrain](main_simmim.py 218): INFO Train: [32/200][0/6787] eta 2:45:23 lr 0.000200 time 1.4621 (1.4621) loss 0.3753 (0.3753) grad_norm 374420.7812 (374420.7812) mem 14543MB +[2023-10-10 18:23:35 simmim_pretrain](main_simmim.py 218): INFO Train: [32/200][500/6787] eta 0:26:45 lr 0.000200 time 0.2468 (0.2554) loss 0.3577 (0.3658) grad_norm 540711.9375 (383967.7188) mem 14543MB +[2023-10-10 18:25:41 simmim_pretrain](main_simmim.py 218): INFO Train: [32/200][1000/6787] eta 0:24:29 lr 0.000200 time 0.2494 (0.2540) loss 0.3798 (0.3660) grad_norm 258010.8750 (inf) mem 14543MB +[2023-10-10 18:27:48 simmim_pretrain](main_simmim.py 218): INFO Train: [32/200][1500/6787] eta 0:22:22 lr 0.000200 time 0.2598 (0.2540) loss 0.3689 (0.3662) grad_norm 159268.8281 (inf) mem 14543MB +[2023-10-10 18:29:58 simmim_pretrain](main_simmim.py 218): INFO Train: [32/200][2000/6787] eta 0:20:20 lr 0.000200 time 0.2572 (0.2551) loss 0.3899 (0.3666) grad_norm 260919.7344 (inf) mem 14543MB +[2023-10-10 18:32:06 simmim_pretrain](main_simmim.py 218): INFO Train: [32/200][2500/6787] eta 0:18:15 lr 0.000200 time 0.2573 (0.2554) loss 0.3765 (0.3669) grad_norm 171259.2188 (inf) mem 14543MB +[2023-10-10 18:34:15 simmim_pretrain](main_simmim.py 218): INFO Train: [32/200][3000/6787] eta 0:16:08 lr 0.000200 time 0.2580 (0.2557) loss 0.3593 (0.3666) grad_norm 201563.5312 (inf) mem 14543MB +[2023-10-10 18:36:23 simmim_pretrain](main_simmim.py 218): INFO Train: [32/200][3500/6787] eta 0:14:01 lr 0.000200 time 0.2596 (0.2559) loss 0.3660 (0.3665) grad_norm 284134.8750 (inf) mem 14543MB +[2023-10-10 18:38:32 
simmim_pretrain](main_simmim.py 218): INFO Train: [32/200][4000/6787] eta 0:11:53 lr 0.000200 time 0.2570 (0.2560) loss 0.3641 (0.3666) grad_norm 294874.4688 (inf) mem 14543MB +[2023-10-10 18:40:40 simmim_pretrain](main_simmim.py 218): INFO Train: [32/200][4500/6787] eta 0:09:45 lr 0.000200 time 0.2594 (0.2562) loss 0.3833 (0.3666) grad_norm 265753.1562 (inf) mem 14543MB +[2023-10-10 18:42:49 simmim_pretrain](main_simmim.py 218): INFO Train: [32/200][5000/6787] eta 0:07:37 lr 0.000200 time 0.2547 (0.2562) loss 0.3934 (0.3666) grad_norm 160660.1406 (inf) mem 14543MB +[2023-10-10 18:44:57 simmim_pretrain](main_simmim.py 218): INFO Train: [32/200][5500/6787] eta 0:05:29 lr 0.000200 time 0.2572 (0.2563) loss 0.3668 (0.3666) grad_norm 650383.4375 (inf) mem 14543MB +[2023-10-10 18:47:06 simmim_pretrain](main_simmim.py 218): INFO Train: [32/200][6000/6787] eta 0:03:21 lr 0.000200 time 0.2571 (0.2564) loss 0.3618 (0.3666) grad_norm 218859.6562 (inf) mem 14543MB +[2023-10-10 18:49:14 simmim_pretrain](main_simmim.py 218): INFO Train: [32/200][6500/6787] eta 0:01:13 lr 0.000200 time 0.2577 (0.2564) loss 0.3807 (0.3666) grad_norm 311430.9062 (inf) mem 14543MB +[2023-10-10 18:50:28 simmim_pretrain](main_simmim.py 228): INFO EPOCH 32 training takes 0:29:01 +[2023-10-10 18:50:30 simmim_pretrain](main_simmim.py 218): INFO Train: [33/200][0/6787] eta 2:36:37 lr 0.000200 time 1.3846 (1.3846) loss 0.3678 (0.3678) grad_norm 293304.9375 (293304.9375) mem 14543MB +[2023-10-10 18:52:36 simmim_pretrain](main_simmim.py 218): INFO Train: [33/200][500/6787] eta 0:26:34 lr 0.000200 time 0.2477 (0.2536) loss 0.3506 (0.3671) grad_norm 188162.4375 (257625.1719) mem 14543MB +[2023-10-10 18:54:42 simmim_pretrain](main_simmim.py 218): INFO Train: [33/200][1000/6787] eta 0:24:23 lr 0.000200 time 0.2519 (0.2529) loss 0.3647 (0.3667) grad_norm 351517.0625 (262316.6875) mem 14543MB +[2023-10-10 18:56:48 simmim_pretrain](main_simmim.py 218): INFO Train: [33/200][1500/6787] eta 0:22:15 lr 0.000200 time 
0.2507 (0.2526) loss 0.3523 (0.3659) grad_norm 408499.0938 (284289.0625) mem 14543MB +[2023-10-10 18:58:54 simmim_pretrain](main_simmim.py 218): INFO Train: [33/200][2000/6787] eta 0:20:08 lr 0.000200 time 0.2541 (0.2524) loss 0.3555 (0.3660) grad_norm 317959.3438 (inf) mem 14543MB +[2023-10-10 19:01:00 simmim_pretrain](main_simmim.py 218): INFO Train: [33/200][2500/6787] eta 0:18:01 lr 0.000200 time 0.2588 (0.2523) loss 0.3766 (0.3663) grad_norm 296599.7812 (inf) mem 14543MB +[2023-10-10 19:03:06 simmim_pretrain](main_simmim.py 218): INFO Train: [33/200][3000/6787] eta 0:15:55 lr 0.000200 time 0.2539 (0.2523) loss 0.3786 (0.3665) grad_norm 280083.1250 (inf) mem 14543MB +[2023-10-10 19:05:12 simmim_pretrain](main_simmim.py 218): INFO Train: [33/200][3500/6787] eta 0:13:49 lr 0.000200 time 0.2516 (0.2524) loss 0.3966 (0.3665) grad_norm 197832.0156 (inf) mem 14543MB +[2023-10-10 19:07:18 simmim_pretrain](main_simmim.py 218): INFO Train: [33/200][4000/6787] eta 0:11:43 lr 0.000200 time 0.2500 (0.2524) loss 0.3745 (0.3666) grad_norm 349727.8125 (inf) mem 14543MB +[2023-10-10 19:09:24 simmim_pretrain](main_simmim.py 218): INFO Train: [33/200][4500/6787] eta 0:09:37 lr 0.000200 time 0.2461 (0.2524) loss 0.3581 (0.3665) grad_norm 202528.2656 (inf) mem 14543MB +[2023-10-10 19:11:30 simmim_pretrain](main_simmim.py 218): INFO Train: [33/200][5000/6787] eta 0:07:30 lr 0.000200 time 0.2526 (0.2523) loss 0.3579 (0.3665) grad_norm 267166.7188 (inf) mem 14543MB +[2023-10-10 19:13:36 simmim_pretrain](main_simmim.py 218): INFO Train: [33/200][5500/6787] eta 0:05:24 lr 0.000200 time 0.2530 (0.2523) loss 0.3607 (0.3666) grad_norm 217122.0938 (inf) mem 14543MB +[2023-10-10 19:15:43 simmim_pretrain](main_simmim.py 218): INFO Train: [33/200][6000/6787] eta 0:03:18 lr 0.000200 time 0.2483 (0.2523) loss 0.3606 (0.3666) grad_norm 111676.1406 (inf) mem 14543MB +[2023-10-10 19:17:49 simmim_pretrain](main_simmim.py 218): INFO Train: [33/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2473 
(0.2523) loss 0.3408 (0.3666) grad_norm 366012.4688 (inf) mem 14543MB +[2023-10-10 19:19:02 simmim_pretrain](main_simmim.py 228): INFO EPOCH 33 training takes 0:28:33 +[2023-10-10 19:19:03 simmim_pretrain](main_simmim.py 218): INFO Train: [34/200][0/6787] eta 2:42:52 lr 0.000200 time 1.4399 (1.4399) loss 0.3570 (0.3570) grad_norm 304229.9062 (304229.9062) mem 14543MB +[2023-10-10 19:21:09 simmim_pretrain](main_simmim.py 218): INFO Train: [34/200][500/6787] eta 0:26:37 lr 0.000200 time 0.2470 (0.2541) loss 0.3538 (0.3670) grad_norm 203120.2344 (252354.2188) mem 14543MB +[2023-10-10 19:23:16 simmim_pretrain](main_simmim.py 218): INFO Train: [34/200][1000/6787] eta 0:24:30 lr 0.000200 time 0.2552 (0.2541) loss 0.3746 (0.3667) grad_norm 340257.7500 (254896.8750) mem 14543MB +[2023-10-10 19:25:24 simmim_pretrain](main_simmim.py 218): INFO Train: [34/200][1500/6787] eta 0:22:26 lr 0.000200 time 0.2552 (0.2547) loss 0.3709 (0.3661) grad_norm 376421.8438 (274746.2188) mem 14543MB +[2023-10-10 19:27:32 simmim_pretrain](main_simmim.py 218): INFO Train: [34/200][2000/6787] eta 0:20:21 lr 0.000200 time 0.2569 (0.2551) loss 0.3675 (0.3658) grad_norm 192640.7188 (299802.8750) mem 14543MB +[2023-10-10 19:29:40 simmim_pretrain](main_simmim.py 218): INFO Train: [34/200][2500/6787] eta 0:18:14 lr 0.000200 time 0.2553 (0.2554) loss 0.3585 (0.3658) grad_norm 400062.5000 (324304.0938) mem 14543MB +[2023-10-10 19:31:49 simmim_pretrain](main_simmim.py 218): INFO Train: [34/200][3000/6787] eta 0:16:07 lr 0.000200 time 0.2571 (0.2556) loss 0.3634 (0.3656) grad_norm 335525.5312 (inf) mem 14543MB +[2023-10-10 19:33:57 simmim_pretrain](main_simmim.py 218): INFO Train: [34/200][3500/6787] eta 0:14:00 lr 0.000200 time 0.2564 (0.2557) loss 0.3775 (0.3656) grad_norm 247021.4688 (inf) mem 14543MB +[2023-10-10 19:36:05 simmim_pretrain](main_simmim.py 218): INFO Train: [34/200][4000/6787] eta 0:11:52 lr 0.000200 time 0.2551 (0.2557) loss 0.3628 (0.3657) grad_norm 288159.8125 (inf) mem 14543MB 
+[2023-10-10 19:38:13 simmim_pretrain](main_simmim.py 218): INFO Train: [34/200][4500/6787] eta 0:09:45 lr 0.000200 time 0.2565 (0.2558) loss 0.3805 (0.3658) grad_norm 452970.0000 (inf) mem 14543MB +[2023-10-10 19:40:21 simmim_pretrain](main_simmim.py 218): INFO Train: [34/200][5000/6787] eta 0:07:36 lr 0.000200 time 0.2546 (0.2557) loss 0.3581 (0.3660) grad_norm 311581.0938 (inf) mem 14543MB +[2023-10-10 19:42:27 simmim_pretrain](main_simmim.py 218): INFO Train: [34/200][5500/6787] eta 0:05:28 lr 0.000200 time 0.2464 (0.2554) loss 0.3735 (0.3661) grad_norm 183112.2500 (inf) mem 14543MB +[2023-10-10 19:44:32 simmim_pretrain](main_simmim.py 218): INFO Train: [34/200][6000/6787] eta 0:03:20 lr 0.000200 time 0.2492 (0.2551) loss 0.3578 (0.3661) grad_norm 746149.3125 (inf) mem 14543MB +[2023-10-10 19:46:38 simmim_pretrain](main_simmim.py 218): INFO Train: [34/200][6500/6787] eta 0:01:13 lr 0.000200 time 0.2530 (0.2548) loss 0.3643 (0.3660) grad_norm 397526.0000 (inf) mem 14543MB +[2023-10-10 19:47:51 simmim_pretrain](main_simmim.py 228): INFO EPOCH 34 training takes 0:28:49 +[2023-10-10 19:47:52 simmim_pretrain](main_simmim.py 218): INFO Train: [35/200][0/6787] eta 2:27:51 lr 0.000200 time 1.3071 (1.3071) loss 0.3896 (0.3896) grad_norm 319609.0000 (319609.0000) mem 14543MB +[2023-10-10 19:49:58 simmim_pretrain](main_simmim.py 218): INFO Train: [35/200][500/6787] eta 0:26:37 lr 0.000200 time 0.2490 (0.2542) loss 0.3565 (0.3664) grad_norm 272074.5625 (inf) mem 14543MB +[2023-10-10 19:52:05 simmim_pretrain](main_simmim.py 218): INFO Train: [35/200][1000/6787] eta 0:24:24 lr 0.000200 time 0.2475 (0.2532) loss 0.3687 (0.3664) grad_norm 385989.0312 (inf) mem 14543MB +[2023-10-10 19:54:10 simmim_pretrain](main_simmim.py 218): INFO Train: [35/200][1500/6787] eta 0:22:16 lr 0.000200 time 0.2487 (0.2527) loss 0.3684 (0.3666) grad_norm 203559.9531 (inf) mem 14543MB +[2023-10-10 19:56:16 simmim_pretrain](main_simmim.py 218): INFO Train: [35/200][2000/6787] eta 0:20:08 lr 0.000200 
time 0.2504 (0.2525) loss 0.3785 (0.3672) grad_norm 112302.0938 (inf) mem 14543MB +[2023-10-10 19:58:22 simmim_pretrain](main_simmim.py 218): INFO Train: [35/200][2500/6787] eta 0:18:01 lr 0.000200 time 0.2531 (0.2524) loss 0.3476 (0.3675) grad_norm 174679.9375 (inf) mem 14543MB +[2023-10-10 20:00:28 simmim_pretrain](main_simmim.py 218): INFO Train: [35/200][3000/6787] eta 0:15:55 lr 0.000200 time 0.2521 (0.2523) loss 0.3589 (0.3677) grad_norm 112545.1719 (inf) mem 14543MB +[2023-10-10 20:02:35 simmim_pretrain](main_simmim.py 218): INFO Train: [35/200][3500/6787] eta 0:13:50 lr 0.000200 time 0.2543 (0.2525) loss 0.3740 (0.3676) grad_norm 273287.6875 (inf) mem 14543MB +[2023-10-10 20:04:43 simmim_pretrain](main_simmim.py 218): INFO Train: [35/200][4000/6787] eta 0:11:44 lr 0.000200 time 0.2547 (0.2529) loss 0.3826 (0.3676) grad_norm 156023.8750 (inf) mem 14543MB +[2023-10-10 20:06:49 simmim_pretrain](main_simmim.py 218): INFO Train: [35/200][4500/6787] eta 0:09:38 lr 0.000200 time 0.2516 (0.2529) loss 0.3852 (0.3674) grad_norm 281602.5312 (inf) mem 14543MB +[2023-10-10 20:08:55 simmim_pretrain](main_simmim.py 218): INFO Train: [35/200][5000/6787] eta 0:07:31 lr 0.000200 time 0.2497 (0.2528) loss 0.3557 (0.3673) grad_norm 237661.0781 (inf) mem 14543MB +[2023-10-10 20:11:03 simmim_pretrain](main_simmim.py 218): INFO Train: [35/200][5500/6787] eta 0:05:25 lr 0.000200 time 0.2606 (0.2530) loss 0.3621 (0.3673) grad_norm 337653.1875 (inf) mem 14543MB +[2023-10-10 20:13:13 simmim_pretrain](main_simmim.py 218): INFO Train: [35/200][6000/6787] eta 0:03:19 lr 0.000200 time 0.2552 (0.2536) loss 0.3739 (0.3672) grad_norm 305393.1875 (inf) mem 14543MB +[2023-10-10 20:15:23 simmim_pretrain](main_simmim.py 218): INFO Train: [35/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2607 (0.2541) loss 0.3521 (0.3670) grad_norm 284520.5312 (inf) mem 14543MB +[2023-10-10 20:16:38 simmim_pretrain](main_simmim.py 228): INFO EPOCH 35 training takes 0:28:46 +[2023-10-10 20:16:39 
simmim_pretrain](main_simmim.py 218): INFO Train: [36/200][0/6787] eta 2:40:34 lr 0.000200 time 1.4196 (1.4196) loss 0.3630 (0.3630) grad_norm 215011.3750 (215011.3750) mem 14543MB +[2023-10-10 20:18:45 simmim_pretrain](main_simmim.py 218): INFO Train: [36/200][500/6787] eta 0:26:34 lr 0.000200 time 0.2491 (0.2536) loss 0.3545 (0.3661) grad_norm 241011.5156 (inf) mem 14543MB +[2023-10-10 20:20:51 simmim_pretrain](main_simmim.py 218): INFO Train: [36/200][1000/6787] eta 0:24:22 lr 0.000200 time 0.2551 (0.2527) loss 0.3205 (0.3655) grad_norm 138857.3750 (inf) mem 14543MB +[2023-10-10 20:22:57 simmim_pretrain](main_simmim.py 218): INFO Train: [36/200][1500/6787] eta 0:22:16 lr 0.000200 time 0.2472 (0.2527) loss 0.3753 (0.3659) grad_norm 314728.5938 (inf) mem 14543MB +[2023-10-10 20:25:04 simmim_pretrain](main_simmim.py 218): INFO Train: [36/200][2000/6787] eta 0:20:09 lr 0.000200 time 0.2474 (0.2527) loss 0.3612 (0.3660) grad_norm 286663.6875 (inf) mem 14543MB +[2023-10-10 20:27:11 simmim_pretrain](main_simmim.py 218): INFO Train: [36/200][2500/6787] eta 0:18:05 lr 0.000200 time 0.2558 (0.2532) loss 0.3782 (0.3661) grad_norm 289186.4688 (inf) mem 14543MB +[2023-10-10 20:29:19 simmim_pretrain](main_simmim.py 218): INFO Train: [36/200][3000/6787] eta 0:16:00 lr 0.000200 time 0.2545 (0.2536) loss 0.3788 (0.3662) grad_norm 309308.5312 (inf) mem 14543MB +[2023-10-10 20:31:26 simmim_pretrain](main_simmim.py 218): INFO Train: [36/200][3500/6787] eta 0:13:53 lr 0.000200 time 0.2589 (0.2537) loss 0.3573 (0.3663) grad_norm 225424.7969 (inf) mem 14543MB +[2023-10-10 20:33:34 simmim_pretrain](main_simmim.py 218): INFO Train: [36/200][4000/6787] eta 0:11:47 lr 0.000200 time 0.2502 (0.2538) loss 0.3606 (0.3662) grad_norm 141248.1406 (inf) mem 14543MB +[2023-10-10 20:35:41 simmim_pretrain](main_simmim.py 218): INFO Train: [36/200][4500/6787] eta 0:09:40 lr 0.000200 time 0.2538 (0.2538) loss 0.3410 (0.3661) grad_norm 297924.9062 (inf) mem 14543MB +[2023-10-10 20:37:47 
simmim_pretrain](main_simmim.py 218): INFO Train: [36/200][5000/6787] eta 0:07:33 lr 0.000200 time 0.2535 (0.2538) loss 0.3575 (0.3660) grad_norm 246211.4062 (inf) mem 14543MB +[2023-10-10 20:39:54 simmim_pretrain](main_simmim.py 218): INFO Train: [36/200][5500/6787] eta 0:05:26 lr 0.000200 time 0.2536 (0.2538) loss 0.3660 (0.3659) grad_norm 340184.7812 (inf) mem 14543MB +[2023-10-10 20:42:00 simmim_pretrain](main_simmim.py 218): INFO Train: [36/200][6000/6787] eta 0:03:19 lr 0.000200 time 0.2514 (0.2537) loss 0.3714 (0.3660) grad_norm 321946.5312 (inf) mem 14543MB +[2023-10-10 20:44:06 simmim_pretrain](main_simmim.py 218): INFO Train: [36/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2518 (0.2536) loss 0.3493 (0.3660) grad_norm 235837.4688 (inf) mem 14543MB +[2023-10-10 20:45:19 simmim_pretrain](main_simmim.py 228): INFO EPOCH 36 training takes 0:28:41 +[2023-10-10 20:45:21 simmim_pretrain](main_simmim.py 218): INFO Train: [37/200][0/6787] eta 2:26:49 lr 0.000200 time 1.2980 (1.2980) loss 0.3625 (0.3625) grad_norm 334360.1875 (334360.1875) mem 14543MB +[2023-10-10 20:47:24 simmim_pretrain](main_simmim.py 218): INFO Train: [37/200][500/6787] eta 0:26:10 lr 0.000200 time 0.2461 (0.2498) loss 0.3499 (0.3651) grad_norm 274292.8750 (293740.8125) mem 14543MB +[2023-10-10 20:49:28 simmim_pretrain](main_simmim.py 218): INFO Train: [37/200][1000/6787] eta 0:24:00 lr 0.000200 time 0.2491 (0.2489) loss 0.3947 (0.3654) grad_norm 603381.3750 (327743.5938) mem 14543MB +[2023-10-10 20:51:32 simmim_pretrain](main_simmim.py 218): INFO Train: [37/200][1500/6787] eta 0:21:54 lr 0.000200 time 0.2461 (0.2486) loss 0.3687 (0.3651) grad_norm 321693.0625 (402229.9062) mem 14543MB +[2023-10-10 20:53:37 simmim_pretrain](main_simmim.py 218): INFO Train: [37/200][2000/6787] eta 0:19:49 lr 0.000200 time 0.2478 (0.2485) loss 0.3550 (0.3653) grad_norm 385785.1875 (inf) mem 14543MB +[2023-10-10 20:55:41 simmim_pretrain](main_simmim.py 218): INFO Train: [37/200][2500/6787] eta 0:17:45 lr 
0.000200 time 0.2461 (0.2485) loss 0.3841 (0.3652) grad_norm 198693.8906 (inf) mem 14543MB +[2023-10-10 20:57:45 simmim_pretrain](main_simmim.py 218): INFO Train: [37/200][3000/6787] eta 0:15:40 lr 0.000200 time 0.2455 (0.2485) loss 0.3653 (0.3654) grad_norm 251116.9375 (inf) mem 14543MB +[2023-10-10 20:59:49 simmim_pretrain](main_simmim.py 218): INFO Train: [37/200][3500/6787] eta 0:13:36 lr 0.000200 time 0.2531 (0.2484) loss 0.3858 (0.3655) grad_norm 269586.3438 (inf) mem 14543MB +[2023-10-10 21:01:53 simmim_pretrain](main_simmim.py 218): INFO Train: [37/200][4000/6787] eta 0:11:32 lr 0.000200 time 0.2461 (0.2483) loss 0.3715 (0.3656) grad_norm 498564.3750 (inf) mem 14543MB +[2023-10-10 21:03:57 simmim_pretrain](main_simmim.py 218): INFO Train: [37/200][4500/6787] eta 0:09:27 lr 0.000200 time 0.2569 (0.2483) loss 0.3783 (0.3656) grad_norm 189048.2500 (inf) mem 14543MB +[2023-10-10 21:06:01 simmim_pretrain](main_simmim.py 218): INFO Train: [37/200][5000/6787] eta 0:07:23 lr 0.000200 time 0.2458 (0.2483) loss 0.3606 (0.3657) grad_norm 152714.5312 (inf) mem 14543MB +[2023-10-10 21:08:05 simmim_pretrain](main_simmim.py 218): INFO Train: [37/200][5500/6787] eta 0:05:19 lr 0.000200 time 0.2464 (0.2483) loss 0.3796 (0.3657) grad_norm 240211.0469 (inf) mem 14543MB +[2023-10-10 21:10:09 simmim_pretrain](main_simmim.py 218): INFO Train: [37/200][6000/6787] eta 0:03:15 lr 0.000200 time 0.2494 (0.2482) loss 0.3738 (0.3657) grad_norm 311883.7188 (inf) mem 14543MB +[2023-10-10 21:12:13 simmim_pretrain](main_simmim.py 218): INFO Train: [37/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2462 (0.2482) loss 0.3732 (0.3656) grad_norm 403084.1875 (inf) mem 14543MB +[2023-10-10 21:13:25 simmim_pretrain](main_simmim.py 228): INFO EPOCH 37 training takes 0:28:05 +[2023-10-10 21:13:26 simmim_pretrain](main_simmim.py 218): INFO Train: [38/200][0/6787] eta 2:27:03 lr 0.000200 time 1.3001 (1.3001) loss 0.3839 (0.3839) grad_norm 450857.7500 (450857.7500) mem 14543MB +[2023-10-10 21:15:30 
simmim_pretrain](main_simmim.py 218): INFO Train: [38/200][500/6787] eta 0:26:12 lr 0.000200 time 0.2457 (0.2501) loss 0.3687 (0.3645) grad_norm 208406.9688 (443340.9375) mem 14543MB +[2023-10-10 21:17:34 simmim_pretrain](main_simmim.py 218): INFO Train: [38/200][1000/6787] eta 0:24:01 lr 0.000200 time 0.2446 (0.2492) loss 0.3754 (0.3651) grad_norm 265558.0312 (inf) mem 14543MB +[2023-10-10 21:19:38 simmim_pretrain](main_simmim.py 218): INFO Train: [38/200][1500/6787] eta 0:21:55 lr 0.000200 time 0.2470 (0.2488) loss 0.3650 (0.3652) grad_norm 279909.9375 (inf) mem 14543MB +[2023-10-10 21:21:42 simmim_pretrain](main_simmim.py 218): INFO Train: [38/200][2000/6787] eta 0:19:50 lr 0.000200 time 0.2539 (0.2486) loss 0.3613 (0.3653) grad_norm 202835.1875 (inf) mem 14543MB +[2023-10-10 21:23:46 simmim_pretrain](main_simmim.py 218): INFO Train: [38/200][2500/6787] eta 0:17:45 lr 0.000200 time 0.2447 (0.2485) loss 0.3575 (0.3654) grad_norm 235722.7188 (inf) mem 14543MB +[2023-10-10 21:25:50 simmim_pretrain](main_simmim.py 218): INFO Train: [38/200][3000/6787] eta 0:15:40 lr 0.000200 time 0.2490 (0.2484) loss 0.3805 (0.3654) grad_norm 307796.7812 (inf) mem 14543MB +[2023-10-10 21:27:54 simmim_pretrain](main_simmim.py 218): INFO Train: [38/200][3500/6787] eta 0:13:36 lr 0.000200 time 0.2456 (0.2483) loss 0.3769 (0.3652) grad_norm 564839.4375 (inf) mem 14543MB +[2023-10-10 21:29:58 simmim_pretrain](main_simmim.py 218): INFO Train: [38/200][4000/6787] eta 0:11:31 lr 0.000200 time 0.2486 (0.2482) loss 0.3585 (0.3651) grad_norm 532107.1250 (inf) mem 14543MB +[2023-10-10 21:32:02 simmim_pretrain](main_simmim.py 218): INFO Train: [38/200][4500/6787] eta 0:09:27 lr 0.000200 time 0.2483 (0.2482) loss 0.3684 (0.3650) grad_norm 534331.6250 (inf) mem 14543MB +[2023-10-10 21:34:06 simmim_pretrain](main_simmim.py 218): INFO Train: [38/200][5000/6787] eta 0:07:23 lr 0.000200 time 0.2515 (0.2482) loss 0.3707 (0.3651) grad_norm 435952.1562 (inf) mem 14543MB +[2023-10-10 21:36:10 
simmim_pretrain](main_simmim.py 218): INFO Train: [38/200][5500/6787] eta 0:05:19 lr 0.000200 time 0.2495 (0.2482) loss 0.3884 (0.3651) grad_norm 253426.7812 (inf) mem 14543MB +[2023-10-10 21:38:14 simmim_pretrain](main_simmim.py 218): INFO Train: [38/200][6000/6787] eta 0:03:15 lr 0.000200 time 0.2473 (0.2482) loss 0.3581 (0.3652) grad_norm 226109.5469 (inf) mem 14543MB +[2023-10-10 21:40:18 simmim_pretrain](main_simmim.py 218): INFO Train: [38/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2467 (0.2482) loss 0.3604 (0.3653) grad_norm 631440.5000 (inf) mem 14543MB +[2023-10-10 21:41:30 simmim_pretrain](main_simmim.py 228): INFO EPOCH 38 training takes 0:28:05 +[2023-10-10 21:41:32 simmim_pretrain](main_simmim.py 218): INFO Train: [39/200][0/6787] eta 2:36:16 lr 0.000200 time 1.3816 (1.3816) loss 0.3679 (0.3679) grad_norm 706728.5625 (706728.5625) mem 14543MB +[2023-10-10 21:43:36 simmim_pretrain](main_simmim.py 218): INFO Train: [39/200][500/6787] eta 0:26:13 lr 0.000200 time 0.2490 (0.2503) loss 0.4073 (0.3653) grad_norm 350115.0312 (398354.1875) mem 14543MB +[2023-10-10 21:45:40 simmim_pretrain](main_simmim.py 218): INFO Train: [39/200][1000/6787] eta 0:24:02 lr 0.000200 time 0.2507 (0.2493) loss 0.3873 (0.3652) grad_norm 270967.1875 (414624.0625) mem 14543MB +[2023-10-10 21:47:44 simmim_pretrain](main_simmim.py 218): INFO Train: [39/200][1500/6787] eta 0:21:56 lr 0.000200 time 0.2464 (0.2490) loss 0.3938 (0.3650) grad_norm 643002.1875 (443122.7500) mem 14543MB +[2023-10-10 21:49:48 simmim_pretrain](main_simmim.py 218): INFO Train: [39/200][2000/6787] eta 0:19:50 lr 0.000200 time 0.2453 (0.2488) loss 0.4011 (0.3903) grad_norm 41909.7773 (inf) mem 14543MB +[2023-10-10 21:51:52 simmim_pretrain](main_simmim.py 218): INFO Train: [39/200][2500/6787] eta 0:17:46 lr 0.000200 time 0.2504 (0.2487) loss 0.4061 (0.3891) grad_norm 35817.0938 (inf) mem 14543MB +[2023-10-10 21:53:56 simmim_pretrain](main_simmim.py 218): INFO Train: [39/200][3000/6787] eta 0:15:41 lr 0.000200 
time 0.2506 (0.2487) loss 0.3861 (0.3866) grad_norm 29815.2129 (inf) mem 14543MB +[2023-10-10 21:56:01 simmim_pretrain](main_simmim.py 218): INFO Train: [39/200][3500/6787] eta 0:13:37 lr 0.000200 time 0.2464 (0.2486) loss 0.3990 (0.3845) grad_norm 34887.1523 (inf) mem 14543MB +[2023-10-10 21:58:05 simmim_pretrain](main_simmim.py 218): INFO Train: [39/200][4000/6787] eta 0:11:32 lr 0.000200 time 0.2461 (0.2486) loss 0.3736 (0.3826) grad_norm 44092.2148 (inf) mem 14543MB +[2023-10-10 22:00:09 simmim_pretrain](main_simmim.py 218): INFO Train: [39/200][4500/6787] eta 0:09:28 lr 0.000200 time 0.2479 (0.2486) loss 0.3700 (0.3811) grad_norm 100508.8438 (inf) mem 14543MB +[2023-10-10 22:02:13 simmim_pretrain](main_simmim.py 218): INFO Train: [39/200][5000/6787] eta 0:07:24 lr 0.000200 time 0.2494 (0.2485) loss 0.3529 (0.3799) grad_norm 71744.2188 (inf) mem 14543MB +[2023-10-10 22:04:17 simmim_pretrain](main_simmim.py 218): INFO Train: [39/200][5500/6787] eta 0:05:19 lr 0.000200 time 0.2467 (0.2485) loss 0.4027 (0.3788) grad_norm 78603.0625 (inf) mem 14543MB +[2023-10-10 22:06:22 simmim_pretrain](main_simmim.py 218): INFO Train: [39/200][6000/6787] eta 0:03:15 lr 0.000200 time 0.2452 (0.2485) loss 0.3646 (0.3778) grad_norm 118329.6953 (inf) mem 14543MB +[2023-10-10 22:08:26 simmim_pretrain](main_simmim.py 218): INFO Train: [39/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2496 (0.2485) loss 0.3442 (0.3769) grad_norm 115296.9219 (inf) mem 14543MB +[2023-10-10 22:09:37 simmim_pretrain](main_simmim.py 228): INFO EPOCH 39 training takes 0:28:07 +[2023-10-10 22:09:39 simmim_pretrain](main_simmim.py 218): INFO Train: [40/200][0/6787] eta 2:33:26 lr 0.000200 time 1.3565 (1.3565) loss 0.3461 (0.3461) grad_norm 119867.4453 (119867.4453) mem 14543MB +[2023-10-10 22:11:43 simmim_pretrain](main_simmim.py 218): INFO Train: [40/200][500/6787] eta 0:26:13 lr 0.000200 time 0.2498 (0.2503) loss 0.3801 (0.3651) grad_norm 93892.8125 (92858.3359) mem 14543MB +[2023-10-10 22:13:47 
simmim_pretrain](main_simmim.py 218): INFO Train: [40/200][1000/6787] eta 0:24:02 lr 0.000200 time 0.2456 (0.2492) loss 0.3660 (0.3657) grad_norm 83616.7109 (104966.8594) mem 14543MB +[2023-10-10 22:15:51 simmim_pretrain](main_simmim.py 218): INFO Train: [40/200][1500/6787] eta 0:21:55 lr 0.000200 time 0.2535 (0.2488) loss 0.3530 (0.3654) grad_norm 86830.6875 (122288.4844) mem 14543MB +[2023-10-10 22:17:55 simmim_pretrain](main_simmim.py 218): INFO Train: [40/200][2000/6787] eta 0:19:50 lr 0.000200 time 0.2460 (0.2487) loss 0.3633 (0.3653) grad_norm 252444.9375 (134365.3750) mem 14543MB +[2023-10-10 22:19:59 simmim_pretrain](main_simmim.py 218): INFO Train: [40/200][2500/6787] eta 0:17:45 lr 0.000200 time 0.2468 (0.2486) loss 0.3802 (0.3653) grad_norm 129132.5625 (146398.0625) mem 14543MB +[2023-10-10 22:22:03 simmim_pretrain](main_simmim.py 218): INFO Train: [40/200][3000/6787] eta 0:15:41 lr 0.000200 time 0.2509 (0.2485) loss 0.3658 (0.3652) grad_norm 243458.6875 (156810.0781) mem 14543MB +[2023-10-10 22:24:07 simmim_pretrain](main_simmim.py 218): INFO Train: [40/200][3500/6787] eta 0:13:36 lr 0.000200 time 0.2470 (0.2484) loss 0.3649 (0.3651) grad_norm 162436.7656 (181904.1094) mem 14543MB +[2023-10-10 22:26:11 simmim_pretrain](main_simmim.py 218): INFO Train: [40/200][4000/6787] eta 0:11:32 lr 0.000200 time 0.2473 (0.2483) loss 0.3626 (0.3649) grad_norm 283710.0312 (194689.8906) mem 14543MB +[2023-10-10 22:28:15 simmim_pretrain](main_simmim.py 218): INFO Train: [40/200][4500/6787] eta 0:09:27 lr 0.000200 time 0.2489 (0.2483) loss 0.3599 (0.3648) grad_norm 224440.3594 (219000.1719) mem 14543MB +[2023-10-10 22:30:19 simmim_pretrain](main_simmim.py 218): INFO Train: [40/200][5000/6787] eta 0:07:23 lr 0.000200 time 0.2485 (0.2483) loss 0.3666 (0.3647) grad_norm 160116.5156 (inf) mem 14543MB +[2023-10-10 22:32:23 simmim_pretrain](main_simmim.py 218): INFO Train: [40/200][5500/6787] eta 0:05:19 lr 0.000200 time 0.2471 (0.2483) loss 0.3772 (0.3647) grad_norm 
87426.3594 (inf) mem 14543MB +[2023-10-10 22:34:27 simmim_pretrain](main_simmim.py 218): INFO Train: [40/200][6000/6787] eta 0:03:15 lr 0.000200 time 0.2485 (0.2483) loss 0.3623 (0.3647) grad_norm 110052.3672 (inf) mem 14543MB +[2023-10-10 22:36:32 simmim_pretrain](main_simmim.py 218): INFO Train: [40/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2465 (0.2483) loss 0.3623 (0.3647) grad_norm 254506.9219 (inf) mem 14543MB +[2023-10-10 22:37:44 simmim_pretrain](main_simmim.py 228): INFO EPOCH 40 training takes 0:28:06 +[2023-10-10 22:37:44 simmim_pretrain](utils.py 62): INFO /root/autodl-tmp/LSQ-simmim/checkpoint/simmim_pretrain/vit_simmim/ckpt_epoch_40.pth saving...... +[2023-10-10 22:37:44 simmim_pretrain](utils.py 64): INFO /root/autodl-tmp/LSQ-simmim/checkpoint/simmim_pretrain/vit_simmim/ckpt_epoch_40.pth saved !!! +[2023-10-10 22:37:46 simmim_pretrain](main_simmim.py 218): INFO Train: [41/200][0/6787] eta 2:31:36 lr 0.000200 time 1.3403 (1.3403) loss 0.3592 (0.3592) grad_norm 335095.7500 (335095.7500) mem 14543MB +[2023-10-10 22:39:50 simmim_pretrain](main_simmim.py 218): INFO Train: [41/200][500/6787] eta 0:26:12 lr 0.000200 time 0.2459 (0.2502) loss 0.3690 (0.3645) grad_norm 261329.5156 (292073.2812) mem 14543MB +[2023-10-10 22:41:54 simmim_pretrain](main_simmim.py 218): INFO Train: [41/200][1000/6787] eta 0:24:03 lr 0.000200 time 0.2466 (0.2494) loss 0.3745 (0.3642) grad_norm 457198.6875 (310417.1875) mem 14543MB +[2023-10-10 22:43:58 simmim_pretrain](main_simmim.py 218): INFO Train: [41/200][1500/6787] eta 0:21:56 lr 0.000200 time 0.2476 (0.2490) loss 0.3607 (0.3638) grad_norm 236496.8594 (361824.3438) mem 14543MB +[2023-10-10 22:46:02 simmim_pretrain](main_simmim.py 218): INFO Train: [41/200][2000/6787] eta 0:19:51 lr 0.000200 time 0.2517 (0.2489) loss 0.3647 (0.3639) grad_norm 450821.4688 (inf) mem 14543MB +[2023-10-10 22:48:06 simmim_pretrain](main_simmim.py 218): INFO Train: [41/200][2500/6787] eta 0:17:46 lr 0.000200 time 0.2515 (0.2488) loss 0.3587 
(0.3637) grad_norm 173073.8750 (inf) mem 14543MB +[2023-10-10 22:50:11 simmim_pretrain](main_simmim.py 218): INFO Train: [41/200][3000/6787] eta 0:15:41 lr 0.000200 time 0.2472 (0.2487) loss 0.3792 (0.3637) grad_norm 447053.1250 (inf) mem 14543MB +[2023-10-10 22:52:15 simmim_pretrain](main_simmim.py 218): INFO Train: [41/200][3500/6787] eta 0:13:37 lr 0.000200 time 0.2475 (0.2487) loss 0.3682 (0.3640) grad_norm 194206.7031 (inf) mem 14543MB +[2023-10-10 22:54:19 simmim_pretrain](main_simmim.py 218): INFO Train: [41/200][4000/6787] eta 0:11:33 lr 0.000200 time 0.2503 (0.2487) loss 0.3821 (0.3642) grad_norm 220258.4531 (inf) mem 14543MB +[2023-10-10 22:56:23 simmim_pretrain](main_simmim.py 218): INFO Train: [41/200][4500/6787] eta 0:09:28 lr 0.000200 time 0.2464 (0.2486) loss 0.3655 (0.3643) grad_norm 280326.7500 (inf) mem 14543MB +[2023-10-10 22:58:28 simmim_pretrain](main_simmim.py 218): INFO Train: [41/200][5000/6787] eta 0:07:24 lr 0.000200 time 0.2526 (0.2486) loss 0.3416 (0.3643) grad_norm 149814.2031 (inf) mem 14543MB +[2023-10-10 23:00:32 simmim_pretrain](main_simmim.py 218): INFO Train: [41/200][5500/6787] eta 0:05:19 lr 0.000200 time 0.2469 (0.2486) loss 0.3415 (0.3644) grad_norm 160095.1875 (inf) mem 14543MB +[2023-10-10 23:02:36 simmim_pretrain](main_simmim.py 218): INFO Train: [41/200][6000/6787] eta 0:03:15 lr 0.000200 time 0.2446 (0.2486) loss 0.3627 (0.3643) grad_norm 310503.7188 (inf) mem 14543MB +[2023-10-10 23:04:40 simmim_pretrain](main_simmim.py 218): INFO Train: [41/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2496 (0.2486) loss 0.3556 (0.3642) grad_norm 791738.8750 (inf) mem 14543MB +[2023-10-10 23:05:52 simmim_pretrain](main_simmim.py 228): INFO EPOCH 41 training takes 0:28:07 +[2023-10-10 23:05:53 simmim_pretrain](main_simmim.py 218): INFO Train: [42/200][0/6787] eta 2:32:37 lr 0.000200 time 1.3493 (1.3493) loss 0.3763 (0.3763) grad_norm 356450.0312 (356450.0312) mem 14543MB +[2023-10-10 23:07:57 simmim_pretrain](main_simmim.py 218): INFO 
Train: [42/200][500/6787] eta 0:26:11 lr 0.000200 time 0.2583 (0.2500) loss 0.3892 (0.3627) grad_norm 429443.6562 (inf) mem 14543MB +[2023-10-10 23:10:01 simmim_pretrain](main_simmim.py 218): INFO Train: [42/200][1000/6787] eta 0:24:00 lr 0.000200 time 0.2465 (0.2489) loss 0.3640 (0.3629) grad_norm 678288.6250 (inf) mem 14543MB +[2023-10-10 23:12:05 simmim_pretrain](main_simmim.py 218): INFO Train: [42/200][1500/6787] eta 0:21:54 lr 0.000200 time 0.2488 (0.2486) loss 0.3653 (0.3633) grad_norm 333081.0312 (inf) mem 14543MB +[2023-10-10 23:14:09 simmim_pretrain](main_simmim.py 218): INFO Train: [42/200][2000/6787] eta 0:19:48 lr 0.000200 time 0.2463 (0.2483) loss 0.3777 (0.3634) grad_norm 360545.2500 (inf) mem 14543MB +[2023-10-10 23:16:13 simmim_pretrain](main_simmim.py 218): INFO Train: [42/200][2500/6787] eta 0:17:44 lr 0.000200 time 0.2475 (0.2482) loss 0.3528 (0.3634) grad_norm 552183.4375 (inf) mem 14543MB +[2023-10-10 23:18:16 simmim_pretrain](main_simmim.py 218): INFO Train: [42/200][3000/6787] eta 0:15:39 lr 0.000200 time 0.2463 (0.2481) loss 0.3798 (0.3634) grad_norm 300832.6875 (inf) mem 14543MB +[2023-10-10 23:20:20 simmim_pretrain](main_simmim.py 218): INFO Train: [42/200][3500/6787] eta 0:13:35 lr 0.000200 time 0.2454 (0.2480) loss 0.3712 (0.3635) grad_norm 233827.8594 (inf) mem 14543MB +[2023-10-10 23:22:24 simmim_pretrain](main_simmim.py 218): INFO Train: [42/200][4000/6787] eta 0:11:31 lr 0.000200 time 0.2486 (0.2480) loss 0.3717 (0.3636) grad_norm 394343.6875 (inf) mem 14543MB +[2023-10-10 23:24:28 simmim_pretrain](main_simmim.py 218): INFO Train: [42/200][4500/6787] eta 0:09:27 lr 0.000200 time 0.2584 (0.2479) loss 0.3503 (0.3637) grad_norm 317775.4688 (inf) mem 14543MB +[2023-10-10 23:26:32 simmim_pretrain](main_simmim.py 218): INFO Train: [42/200][5000/6787] eta 0:07:23 lr 0.000200 time 0.2459 (0.2479) loss 0.3850 (0.3638) grad_norm 239926.7656 (inf) mem 14543MB +[2023-10-10 23:28:36 simmim_pretrain](main_simmim.py 218): INFO Train: 
[42/200][5500/6787] eta 0:05:19 lr 0.000200 time 0.2522 (0.2479) loss 0.3518 (0.3638) grad_norm 160476.3125 (inf) mem 14543MB +[2023-10-10 23:30:40 simmim_pretrain](main_simmim.py 218): INFO Train: [42/200][6000/6787] eta 0:03:15 lr 0.000200 time 0.2462 (0.2479) loss 0.3756 (0.3638) grad_norm 210586.4531 (inf) mem 14543MB +[2023-10-10 23:32:44 simmim_pretrain](main_simmim.py 218): INFO Train: [42/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2484 (0.2479) loss 0.3865 (0.3638) grad_norm 663934.7500 (inf) mem 14543MB +[2023-10-10 23:33:55 simmim_pretrain](main_simmim.py 228): INFO EPOCH 42 training takes 0:28:03 +[2023-10-10 23:33:57 simmim_pretrain](main_simmim.py 218): INFO Train: [43/200][0/6787] eta 2:17:25 lr 0.000200 time 1.2148 (1.2148) loss 0.3362 (0.3362) grad_norm 664057.2500 (664057.2500) mem 14543MB +[2023-10-10 23:36:01 simmim_pretrain](main_simmim.py 218): INFO Train: [43/200][500/6787] eta 0:26:11 lr 0.000200 time 0.2461 (0.2500) loss 0.3644 (0.3627) grad_norm 370556.8750 (inf) mem 14543MB +[2023-10-10 23:38:05 simmim_pretrain](main_simmim.py 218): INFO Train: [43/200][1000/6787] eta 0:24:01 lr 0.000200 time 0.2528 (0.2491) loss 0.3590 (0.3631) grad_norm 758542.1250 (inf) mem 14543MB +[2023-10-10 23:40:09 simmim_pretrain](main_simmim.py 218): INFO Train: [43/200][1500/6787] eta 0:21:55 lr 0.000200 time 0.2461 (0.2488) loss 0.3608 (0.3637) grad_norm 274564.7812 (inf) mem 14543MB +[2023-10-10 23:42:13 simmim_pretrain](main_simmim.py 218): INFO Train: [43/200][2000/6787] eta 0:19:50 lr 0.000200 time 0.2473 (0.2487) loss 0.3464 (0.3639) grad_norm 267989.1250 (inf) mem 14543MB +[2023-10-10 23:44:17 simmim_pretrain](main_simmim.py 218): INFO Train: [43/200][2500/6787] eta 0:17:45 lr 0.000200 time 0.2460 (0.2486) loss 0.3591 (0.3643) grad_norm 283347.2188 (inf) mem 14543MB +[2023-10-10 23:46:21 simmim_pretrain](main_simmim.py 218): INFO Train: [43/200][3000/6787] eta 0:15:41 lr 0.000200 time 0.2521 (0.2486) loss 0.3408 (0.3643) grad_norm 279111.9688 (inf) 
mem 14543MB +[2023-10-10 23:48:25 simmim_pretrain](main_simmim.py 218): INFO Train: [43/200][3500/6787] eta 0:13:36 lr 0.000200 time 0.2460 (0.2485) loss 0.3596 (0.3643) grad_norm 640211.0625 (inf) mem 14543MB +[2023-10-10 23:50:30 simmim_pretrain](main_simmim.py 218): INFO Train: [43/200][4000/6787] eta 0:11:32 lr 0.000200 time 0.2504 (0.2485) loss 0.3527 (0.3642) grad_norm 488216.4688 (inf) mem 14543MB +[2023-10-10 23:52:34 simmim_pretrain](main_simmim.py 218): INFO Train: [43/200][4500/6787] eta 0:09:28 lr 0.000200 time 0.2482 (0.2485) loss 0.3690 (0.3639) grad_norm 217386.0469 (inf) mem 14543MB +[2023-10-10 23:54:38 simmim_pretrain](main_simmim.py 218): INFO Train: [43/200][5000/6787] eta 0:07:24 lr 0.000200 time 0.2494 (0.2485) loss 0.3559 (0.3640) grad_norm 340602.0625 (inf) mem 14543MB +[2023-10-10 23:56:42 simmim_pretrain](main_simmim.py 218): INFO Train: [43/200][5500/6787] eta 0:05:19 lr 0.000200 time 0.2445 (0.2485) loss 0.3807 (0.3639) grad_norm 232049.5000 (inf) mem 14543MB +[2023-10-10 23:58:46 simmim_pretrain](main_simmim.py 218): INFO Train: [43/200][6000/6787] eta 0:03:15 lr 0.000200 time 0.2447 (0.2485) loss 0.3534 (0.3639) grad_norm 516933.0312 (inf) mem 14543MB +[2023-10-11 00:00:51 simmim_pretrain](main_simmim.py 218): INFO Train: [43/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2486 (0.2484) loss 0.3595 (0.3639) grad_norm 367178.9062 (inf) mem 14543MB +[2023-10-11 00:02:02 simmim_pretrain](main_simmim.py 228): INFO EPOCH 43 training takes 0:28:06 +[2023-10-11 00:02:04 simmim_pretrain](main_simmim.py 218): INFO Train: [44/200][0/6787] eta 2:32:22 lr 0.000200 time 1.3471 (1.3471) loss 0.3786 (0.3786) grad_norm 544773.5625 (544773.5625) mem 14543MB +[2023-10-11 00:04:07 simmim_pretrain](main_simmim.py 218): INFO Train: [44/200][500/6787] eta 0:26:10 lr 0.000200 time 0.2483 (0.2498) loss 0.3803 (0.3625) grad_norm 584269.8125 (inf) mem 14543MB +[2023-10-11 00:06:11 simmim_pretrain](main_simmim.py 218): INFO Train: [44/200][1000/6787] eta 0:24:00 
lr 0.000200 time 0.2501 (0.2489) loss 0.3747 (0.3633) grad_norm 776864.4375 (inf) mem 14543MB +[2023-10-11 00:08:15 simmim_pretrain](main_simmim.py 218): INFO Train: [44/200][1500/6787] eta 0:21:54 lr 0.000200 time 0.2456 (0.2486) loss 0.3549 (0.3631) grad_norm 564549.8125 (inf) mem 14543MB +[2023-10-11 00:10:19 simmim_pretrain](main_simmim.py 218): INFO Train: [44/200][2000/6787] eta 0:19:49 lr 0.000200 time 0.2467 (0.2484) loss 0.3726 (0.3633) grad_norm 294429.7500 (inf) mem 14543MB +[2023-10-11 00:12:23 simmim_pretrain](main_simmim.py 218): INFO Train: [44/200][2500/6787] eta 0:17:44 lr 0.000200 time 0.2458 (0.2483) loss 0.3813 (0.3635) grad_norm 451391.1875 (inf) mem 14543MB +[2023-10-11 00:14:27 simmim_pretrain](main_simmim.py 218): INFO Train: [44/200][3000/6787] eta 0:15:40 lr 0.000200 time 0.2465 (0.2483) loss 0.3803 (0.3638) grad_norm 440069.8750 (inf) mem 14543MB +[2023-10-11 00:16:31 simmim_pretrain](main_simmim.py 218): INFO Train: [44/200][3500/6787] eta 0:13:35 lr 0.000200 time 0.2465 (0.2482) loss 0.3627 (0.3639) grad_norm 181651.0938 (inf) mem 14543MB +[2023-10-11 00:18:35 simmim_pretrain](main_simmim.py 218): INFO Train: [44/200][4000/6787] eta 0:11:31 lr 0.000200 time 0.2459 (0.2482) loss 0.3704 (0.3641) grad_norm 510534.1875 (inf) mem 14543MB +[2023-10-11 00:20:39 simmim_pretrain](main_simmim.py 218): INFO Train: [44/200][4500/6787] eta 0:09:27 lr 0.000200 time 0.2484 (0.2482) loss 0.3561 (0.3640) grad_norm 203330.4219 (inf) mem 14543MB +[2023-10-11 00:22:43 simmim_pretrain](main_simmim.py 218): INFO Train: [44/200][5000/6787] eta 0:07:23 lr 0.000200 time 0.2472 (0.2481) loss 0.3775 (0.3640) grad_norm 382957.3750 (inf) mem 14543MB +[2023-10-11 00:24:47 simmim_pretrain](main_simmim.py 218): INFO Train: [44/200][5500/6787] eta 0:05:19 lr 0.000200 time 0.2517 (0.2481) loss 0.3520 (0.3641) grad_norm 276487.5312 (inf) mem 14543MB +[2023-10-11 00:26:51 simmim_pretrain](main_simmim.py 218): INFO Train: [44/200][6000/6787] eta 0:03:15 lr 0.000200 time 
0.2484 (0.2481) loss 0.3767 (0.3642) grad_norm 248337.1406 (inf) mem 14543MB +[2023-10-11 00:28:55 simmim_pretrain](main_simmim.py 218): INFO Train: [44/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2452 (0.2481) loss 0.3934 (0.3642) grad_norm 320632.6562 (inf) mem 14543MB +[2023-10-11 00:30:07 simmim_pretrain](main_simmim.py 228): INFO EPOCH 44 training takes 0:28:04 +[2023-10-11 00:30:08 simmim_pretrain](main_simmim.py 218): INFO Train: [45/200][0/6787] eta 2:57:11 lr 0.000200 time 1.5664 (1.5664) loss 0.3878 (0.3878) grad_norm 163496.7344 (163496.7344) mem 14543MB +[2023-10-11 00:32:13 simmim_pretrain](main_simmim.py 218): INFO Train: [45/200][500/6787] eta 0:26:18 lr 0.000200 time 0.2454 (0.2510) loss 0.3620 (0.3642) grad_norm 379163.0625 (387397.6250) mem 14543MB +[2023-10-11 00:34:17 simmim_pretrain](main_simmim.py 218): INFO Train: [45/200][1000/6787] eta 0:24:07 lr 0.000200 time 0.2478 (0.2502) loss 0.3446 (0.3633) grad_norm 1049138.6250 (387211.0312) mem 14543MB +[2023-10-11 00:36:22 simmim_pretrain](main_simmim.py 218): INFO Train: [45/200][1500/6787] eta 0:22:01 lr 0.000200 time 0.2451 (0.2500) loss 0.3507 (0.3638) grad_norm 250728.8125 (inf) mem 14543MB +[2023-10-11 00:38:27 simmim_pretrain](main_simmim.py 218): INFO Train: [45/200][2000/6787] eta 0:19:55 lr 0.000200 time 0.2516 (0.2498) loss 0.3703 (0.3640) grad_norm 176143.6406 (inf) mem 14543MB +[2023-10-11 00:40:32 simmim_pretrain](main_simmim.py 218): INFO Train: [45/200][2500/6787] eta 0:17:50 lr 0.000200 time 0.2489 (0.2498) loss 0.3568 (0.3643) grad_norm 308656.3125 (inf) mem 14543MB +[2023-10-11 00:42:36 simmim_pretrain](main_simmim.py 218): INFO Train: [45/200][3000/6787] eta 0:15:45 lr 0.000200 time 0.2470 (0.2498) loss 0.3668 (0.3644) grad_norm 361113.4062 (inf) mem 14543MB +[2023-10-11 00:44:41 simmim_pretrain](main_simmim.py 218): INFO Train: [45/200][3500/6787] eta 0:13:41 lr 0.000200 time 0.2505 (0.2498) loss 0.3555 (0.3645) grad_norm 256804.1250 (inf) mem 14543MB +[2023-10-11 00:46:47 
simmim_pretrain](main_simmim.py 218): INFO Train: [45/200][4000/6787] eta 0:11:36 lr 0.000200 time 0.2594 (0.2499) loss 0.3612 (0.3645) grad_norm 216762.2500 (inf) mem 14543MB +[2023-10-11 00:48:52 simmim_pretrain](main_simmim.py 218): INFO Train: [45/200][4500/6787] eta 0:09:31 lr 0.000200 time 0.2455 (0.2499) loss 0.3551 (0.3645) grad_norm 350893.2500 (inf) mem 14543MB +[2023-10-11 00:50:56 simmim_pretrain](main_simmim.py 218): INFO Train: [45/200][5000/6787] eta 0:07:26 lr 0.000200 time 0.2585 (0.2499) loss 0.3717 (0.3645) grad_norm 427980.7500 (inf) mem 14543MB +[2023-10-11 00:53:01 simmim_pretrain](main_simmim.py 218): INFO Train: [45/200][5500/6787] eta 0:05:21 lr 0.000200 time 0.2516 (0.2498) loss 0.3373 (0.3645) grad_norm 338574.9375 (inf) mem 14543MB +[2023-10-11 00:55:06 simmim_pretrain](main_simmim.py 218): INFO Train: [45/200][6000/6787] eta 0:03:16 lr 0.000200 time 0.2522 (0.2499) loss 0.3605 (0.3643) grad_norm 401816.5938 (inf) mem 14543MB +[2023-10-11 00:57:11 simmim_pretrain](main_simmim.py 218): INFO Train: [45/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2586 (0.2499) loss 0.3600 (0.3643) grad_norm 627936.7500 (inf) mem 14543MB +[2023-10-11 00:58:23 simmim_pretrain](main_simmim.py 228): INFO EPOCH 45 training takes 0:28:16 +[2023-10-11 00:58:25 simmim_pretrain](main_simmim.py 218): INFO Train: [46/200][0/6787] eta 2:41:29 lr 0.000200 time 1.4277 (1.4277) loss 0.3555 (0.3555) grad_norm 489117.3125 (489117.3125) mem 14543MB +[2023-10-11 01:00:30 simmim_pretrain](main_simmim.py 218): INFO Train: [46/200][500/6787] eta 0:26:30 lr 0.000200 time 0.2555 (0.2529) loss 0.3367 (0.3634) grad_norm 723940.0625 (inf) mem 14543MB +[2023-10-11 01:02:36 simmim_pretrain](main_simmim.py 218): INFO Train: [46/200][1000/6787] eta 0:24:20 lr 0.000200 time 0.2505 (0.2523) loss 0.3571 (0.3634) grad_norm 697757.7500 (inf) mem 14543MB +[2023-10-11 01:04:42 simmim_pretrain](main_simmim.py 218): INFO Train: [46/200][1500/6787] eta 0:22:13 lr 0.000200 time 0.2523 (0.2522) 
loss 0.3778 (0.3630) grad_norm 548565.5000 (inf) mem 14543MB +[2023-10-11 01:06:48 simmim_pretrain](main_simmim.py 218): INFO Train: [46/200][2000/6787] eta 0:20:06 lr 0.000200 time 0.2525 (0.2521) loss 0.3641 (0.3631) grad_norm 478167.4062 (inf) mem 14543MB +[2023-10-11 01:08:54 simmim_pretrain](main_simmim.py 218): INFO Train: [46/200][2500/6787] eta 0:18:00 lr 0.000200 time 0.2524 (0.2520) loss 0.3573 (0.3627) grad_norm 481278.5938 (inf) mem 14543MB +[2023-10-11 01:11:00 simmim_pretrain](main_simmim.py 218): INFO Train: [46/200][3000/6787] eta 0:15:54 lr 0.000200 time 0.2554 (0.2520) loss 0.3284 (0.3628) grad_norm 598187.1250 (inf) mem 14543MB +[2023-10-11 01:13:06 simmim_pretrain](main_simmim.py 218): INFO Train: [46/200][3500/6787] eta 0:13:48 lr 0.000200 time 0.2514 (0.2519) loss 0.3649 (0.3627) grad_norm 466964.3125 (inf) mem 14543MB +[2023-10-11 01:15:11 simmim_pretrain](main_simmim.py 218): INFO Train: [46/200][4000/6787] eta 0:11:42 lr 0.000200 time 0.2506 (0.2519) loss 0.3532 (0.3629) grad_norm 341306.7812 (inf) mem 14543MB +[2023-10-11 01:17:17 simmim_pretrain](main_simmim.py 218): INFO Train: [46/200][4500/6787] eta 0:09:35 lr 0.000200 time 0.2500 (0.2518) loss 0.5133 (0.3670) grad_norm 8192.9541 (inf) mem 14543MB +[2023-10-11 01:19:23 simmim_pretrain](main_simmim.py 218): INFO Train: [46/200][5000/6787] eta 0:07:29 lr 0.000200 time 0.2492 (0.2518) loss 0.3916 (0.3736) grad_norm 35077.4961 (inf) mem 14543MB +[2023-10-11 01:21:28 simmim_pretrain](main_simmim.py 218): INFO Train: [46/200][5500/6787] eta 0:05:23 lr 0.000200 time 0.2439 (0.2517) loss 0.3743 (0.3739) grad_norm 31794.4277 (inf) mem 14543MB +[2023-10-11 01:23:34 simmim_pretrain](main_simmim.py 218): INFO Train: [46/200][6000/6787] eta 0:03:18 lr 0.000200 time 0.2500 (0.2517) loss 0.3786 (0.3737) grad_norm 40885.3125 (inf) mem 14543MB +[2023-10-11 01:25:39 simmim_pretrain](main_simmim.py 218): INFO Train: [46/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2510 (0.2516) loss 0.3573 (0.3736) 
grad_norm 29708.0996 (inf) mem 14543MB +[2023-10-11 01:26:52 simmim_pretrain](main_simmim.py 228): INFO EPOCH 46 training takes 0:28:28 +[2023-10-11 01:26:53 simmim_pretrain](main_simmim.py 218): INFO Train: [47/200][0/6787] eta 2:56:28 lr 0.000200 time 1.5602 (1.5602) loss 0.3808 (0.3808) grad_norm 27461.1562 (27461.1562) mem 14543MB +[2023-10-11 01:28:58 simmim_pretrain](main_simmim.py 218): INFO Train: [47/200][500/6787] eta 0:26:22 lr 0.000200 time 0.2512 (0.2517) loss 0.3734 (0.3680) grad_norm 69219.5391 (48537.2734) mem 14543MB +[2023-10-11 01:31:03 simmim_pretrain](main_simmim.py 218): INFO Train: [47/200][1000/6787] eta 0:24:10 lr 0.000200 time 0.2467 (0.2507) loss 0.3630 (0.3677) grad_norm 50439.2930 (50515.4961) mem 14543MB +[2023-10-11 01:33:08 simmim_pretrain](main_simmim.py 218): INFO Train: [47/200][1500/6787] eta 0:22:03 lr 0.000200 time 0.2496 (0.2503) loss 0.3663 (0.3677) grad_norm 78190.7188 (51518.5156) mem 14543MB +[2023-10-11 01:35:12 simmim_pretrain](main_simmim.py 218): INFO Train: [47/200][2000/6787] eta 0:19:57 lr 0.000200 time 0.2471 (0.2501) loss 0.3632 (0.3675) grad_norm 59504.5820 (55988.7344) mem 14543MB +[2023-10-11 01:37:17 simmim_pretrain](main_simmim.py 218): INFO Train: [47/200][2500/6787] eta 0:17:52 lr 0.000200 time 0.2480 (0.2501) loss 0.3664 (0.3670) grad_norm 82650.3438 (62042.2109) mem 14543MB +[2023-10-11 01:39:22 simmim_pretrain](main_simmim.py 218): INFO Train: [47/200][3000/6787] eta 0:15:46 lr 0.000200 time 0.2589 (0.2500) loss 0.3406 (0.3667) grad_norm 113726.7969 (71312.3906) mem 14543MB +[2023-10-11 01:41:27 simmim_pretrain](main_simmim.py 218): INFO Train: [47/200][3500/6787] eta 0:13:41 lr 0.000200 time 0.2461 (0.2500) loss 0.3812 (0.3665) grad_norm 113979.7656 (75201.1094) mem 14543MB +[2023-10-11 01:43:32 simmim_pretrain](main_simmim.py 218): INFO Train: [47/200][4000/6787] eta 0:11:36 lr 0.000200 time 0.2463 (0.2500) loss 0.3696 (0.3662) grad_norm 273631.8750 (85130.3906) mem 14543MB +[2023-10-11 01:45:37 
simmim_pretrain](main_simmim.py 218): INFO Train: [47/200][4500/6787] eta 0:09:31 lr 0.000200 time 0.2534 (0.2500) loss 0.3499 (0.3660) grad_norm 114111.0938 (94094.6016) mem 14543MB +[2023-10-11 01:47:42 simmim_pretrain](main_simmim.py 218): INFO Train: [47/200][5000/6787] eta 0:07:26 lr 0.000200 time 0.2548 (0.2500) loss 0.3617 (0.3658) grad_norm 345186.6562 (102983.7734) mem 14543MB +[2023-10-11 01:49:47 simmim_pretrain](main_simmim.py 218): INFO Train: [47/200][5500/6787] eta 0:05:21 lr 0.000200 time 0.2465 (0.2499) loss 0.3850 (0.3657) grad_norm 280478.8125 (111281.3984) mem 14543MB +[2023-10-11 01:51:52 simmim_pretrain](main_simmim.py 218): INFO Train: [47/200][6000/6787] eta 0:03:16 lr 0.000200 time 0.2547 (0.2499) loss 0.3660 (0.3654) grad_norm 279369.4688 (122865.2031) mem 14543MB +[2023-10-11 01:53:56 simmim_pretrain](main_simmim.py 218): INFO Train: [47/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2494 (0.2499) loss 0.3787 (0.3652) grad_norm 465164.7812 (144529.6719) mem 14543MB +[2023-10-11 01:55:09 simmim_pretrain](main_simmim.py 228): INFO EPOCH 47 training takes 0:28:16 +[2023-10-11 01:55:10 simmim_pretrain](main_simmim.py 218): INFO Train: [48/200][0/6787] eta 2:55:42 lr 0.000200 time 1.5534 (1.5534) loss 0.3367 (0.3367) grad_norm 270580.6875 (270580.6875) mem 14543MB +[2023-10-11 01:57:15 simmim_pretrain](main_simmim.py 218): INFO Train: [48/200][500/6787] eta 0:26:24 lr 0.000200 time 0.2488 (0.2520) loss 0.3529 (0.3638) grad_norm 119517.2656 (233062.7812) mem 14543MB +[2023-10-11 01:59:20 simmim_pretrain](main_simmim.py 218): INFO Train: [48/200][1000/6787] eta 0:24:11 lr 0.000200 time 0.2472 (0.2507) loss 0.3352 (0.3642) grad_norm 247407.5938 (225968.4844) mem 14543MB +[2023-10-11 02:01:24 simmim_pretrain](main_simmim.py 218): INFO Train: [48/200][1500/6787] eta 0:22:03 lr 0.000200 time 0.2517 (0.2504) loss 0.3599 (0.3641) grad_norm 218541.5938 (223707.5938) mem 14543MB +[2023-10-11 02:03:29 simmim_pretrain](main_simmim.py 218): INFO Train: 
[48/200][2000/6787] eta 0:19:57 lr 0.000200 time 0.2473 (0.2501) loss 0.3621 (0.3640) grad_norm 179888.3750 (225409.7344) mem 14543MB +[2023-10-11 02:05:34 simmim_pretrain](main_simmim.py 218): INFO Train: [48/200][2500/6787] eta 0:17:51 lr 0.000200 time 0.2461 (0.2499) loss 0.3708 (0.3637) grad_norm 216551.4531 (245487.7969) mem 14543MB +[2023-10-11 02:07:39 simmim_pretrain](main_simmim.py 218): INFO Train: [48/200][3000/6787] eta 0:15:46 lr 0.000200 time 0.2525 (0.2499) loss 0.3801 (0.3637) grad_norm 178786.7500 (inf) mem 14543MB +[2023-10-11 02:09:43 simmim_pretrain](main_simmim.py 218): INFO Train: [48/200][3500/6787] eta 0:13:41 lr 0.000200 time 0.2508 (0.2498) loss 0.3703 (0.3636) grad_norm 157914.2812 (inf) mem 14543MB +[2023-10-11 02:11:48 simmim_pretrain](main_simmim.py 218): INFO Train: [48/200][4000/6787] eta 0:11:36 lr 0.000200 time 0.2535 (0.2498) loss 0.3606 (0.3637) grad_norm 310359.0938 (inf) mem 14543MB +[2023-10-11 02:13:53 simmim_pretrain](main_simmim.py 218): INFO Train: [48/200][4500/6787] eta 0:09:31 lr 0.000200 time 0.2473 (0.2497) loss 0.3440 (0.3637) grad_norm 220890.3125 (inf) mem 14543MB +[2023-10-11 02:15:57 simmim_pretrain](main_simmim.py 218): INFO Train: [48/200][5000/6787] eta 0:07:26 lr 0.000200 time 0.2489 (0.2497) loss 0.3589 (0.3637) grad_norm 276843.0625 (inf) mem 14543MB +[2023-10-11 02:18:02 simmim_pretrain](main_simmim.py 218): INFO Train: [48/200][5500/6787] eta 0:05:21 lr 0.000200 time 0.2513 (0.2497) loss 0.3608 (0.3636) grad_norm 451188.0000 (inf) mem 14543MB +[2023-10-11 02:20:07 simmim_pretrain](main_simmim.py 218): INFO Train: [48/200][6000/6787] eta 0:03:16 lr 0.000200 time 0.2519 (0.2497) loss 0.3770 (0.3635) grad_norm 232435.2344 (inf) mem 14543MB +[2023-10-11 02:22:12 simmim_pretrain](main_simmim.py 218): INFO Train: [48/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2554 (0.2497) loss 0.3626 (0.3636) grad_norm 345978.5312 (inf) mem 14543MB +[2023-10-11 02:23:24 simmim_pretrain](main_simmim.py 228): INFO EPOCH 48 
training takes 0:28:15 +[2023-10-11 02:23:25 simmim_pretrain](main_simmim.py 218): INFO Train: [49/200][0/6787] eta 2:57:03 lr 0.000200 time 1.5652 (1.5652) loss 0.3558 (0.3558) grad_norm 467449.9375 (467449.9375) mem 14543MB +[2023-10-11 02:25:30 simmim_pretrain](main_simmim.py 218): INFO Train: [49/200][500/6787] eta 0:26:22 lr 0.000200 time 0.2486 (0.2518) loss 0.3571 (0.3626) grad_norm 97589.9141 (inf) mem 14543MB +[2023-10-11 02:27:35 simmim_pretrain](main_simmim.py 218): INFO Train: [49/200][1000/6787] eta 0:24:10 lr 0.000200 time 0.2478 (0.2507) loss 0.3597 (0.3634) grad_norm 234085.4688 (inf) mem 14543MB +[2023-10-11 02:29:40 simmim_pretrain](main_simmim.py 218): INFO Train: [49/200][1500/6787] eta 0:22:03 lr 0.000200 time 0.2478 (0.2504) loss 0.3727 (0.3637) grad_norm 209717.7812 (inf) mem 14543MB +[2023-10-11 02:31:45 simmim_pretrain](main_simmim.py 218): INFO Train: [49/200][2000/6787] eta 0:19:58 lr 0.000200 time 0.2473 (0.2503) loss 0.3841 (0.3640) grad_norm 149170.3125 (inf) mem 14543MB +[2023-10-11 02:33:50 simmim_pretrain](main_simmim.py 218): INFO Train: [49/200][2500/6787] eta 0:17:52 lr 0.000200 time 0.2508 (0.2502) loss 0.3671 (0.3640) grad_norm 304377.5938 (inf) mem 14543MB +[2023-10-11 02:35:54 simmim_pretrain](main_simmim.py 218): INFO Train: [49/200][3000/6787] eta 0:15:47 lr 0.000200 time 0.2500 (0.2501) loss 0.3893 (0.3637) grad_norm 431888.3438 (inf) mem 14543MB +[2023-10-11 02:37:59 simmim_pretrain](main_simmim.py 218): INFO Train: [49/200][3500/6787] eta 0:13:41 lr 0.000200 time 0.2553 (0.2501) loss 0.3362 (0.3633) grad_norm 448759.0625 (inf) mem 14543MB +[2023-10-11 02:40:04 simmim_pretrain](main_simmim.py 218): INFO Train: [49/200][4000/6787] eta 0:11:36 lr 0.000200 time 0.2502 (0.2500) loss 0.3391 (0.3632) grad_norm 308764.1250 (inf) mem 14543MB +[2023-10-11 02:42:09 simmim_pretrain](main_simmim.py 218): INFO Train: [49/200][4500/6787] eta 0:09:31 lr 0.000200 time 0.2470 (0.2499) loss 0.3638 (0.3632) grad_norm 407801.9062 (inf) mem 
14543MB +[2023-10-11 02:44:14 simmim_pretrain](main_simmim.py 218): INFO Train: [49/200][5000/6787] eta 0:07:26 lr 0.000200 time 0.2511 (0.2499) loss 0.3914 (0.3631) grad_norm 403336.8750 (inf) mem 14543MB +[2023-10-11 02:46:19 simmim_pretrain](main_simmim.py 218): INFO Train: [49/200][5500/6787] eta 0:05:21 lr 0.000200 time 0.2473 (0.2499) loss 0.3771 (0.3630) grad_norm 286049.4688 (inf) mem 14543MB +[2023-10-11 02:48:24 simmim_pretrain](main_simmim.py 218): INFO Train: [49/200][6000/6787] eta 0:03:16 lr 0.000200 time 0.2479 (0.2499) loss 0.3578 (0.3630) grad_norm 222491.9062 (inf) mem 14543MB +[2023-10-11 02:50:29 simmim_pretrain](main_simmim.py 218): INFO Train: [49/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2478 (0.2499) loss 0.3450 (0.3631) grad_norm 95039.0859 (inf) mem 14543MB +[2023-10-11 02:51:41 simmim_pretrain](main_simmim.py 228): INFO EPOCH 49 training takes 0:28:17 +[2023-10-11 02:51:42 simmim_pretrain](main_simmim.py 218): INFO Train: [50/200][0/6787] eta 2:40:53 lr 0.000200 time 1.4224 (1.4224) loss 0.3475 (0.3475) grad_norm 280531.1562 (280531.1562) mem 14543MB +[2023-10-11 02:53:47 simmim_pretrain](main_simmim.py 218): INFO Train: [50/200][500/6787] eta 0:26:21 lr 0.000200 time 0.2501 (0.2516) loss 0.3760 (0.3630) grad_norm 211297.5469 (253206.5469) mem 14543MB +[2023-10-11 02:55:52 simmim_pretrain](main_simmim.py 218): INFO Train: [50/200][1000/6787] eta 0:24:10 lr 0.000200 time 0.2508 (0.2507) loss 0.3780 (0.3631) grad_norm 234479.2031 (256044.3281) mem 14543MB +[2023-10-11 02:57:57 simmim_pretrain](main_simmim.py 218): INFO Train: [50/200][1500/6787] eta 0:22:03 lr 0.000200 time 0.2462 (0.2503) loss 0.3833 (0.3632) grad_norm 309001.9688 (278049.4375) mem 14543MB +[2023-10-11 03:00:01 simmim_pretrain](main_simmim.py 218): INFO Train: [50/200][2000/6787] eta 0:19:57 lr 0.000200 time 0.2495 (0.2501) loss 0.3651 (0.3630) grad_norm 214604.1250 (inf) mem 14543MB +[2023-10-11 03:02:06 simmim_pretrain](main_simmim.py 218): INFO Train: 
[50/200][2500/6787] eta 0:17:51 lr 0.000200 time 0.2506 (0.2499) loss 0.3601 (0.3632) grad_norm 202331.9219 (inf) mem 14543MB +[2023-10-11 03:04:11 simmim_pretrain](main_simmim.py 218): INFO Train: [50/200][3000/6787] eta 0:15:46 lr 0.000200 time 0.2470 (0.2499) loss 0.3670 (0.3633) grad_norm 325879.5312 (inf) mem 14543MB +[2023-10-11 03:06:16 simmim_pretrain](main_simmim.py 218): INFO Train: [50/200][3500/6787] eta 0:13:41 lr 0.000200 time 0.2446 (0.2499) loss 0.3825 (0.3633) grad_norm 153282.1875 (inf) mem 14543MB +[2023-10-11 03:08:20 simmim_pretrain](main_simmim.py 218): INFO Train: [50/200][4000/6787] eta 0:11:36 lr 0.000200 time 0.2458 (0.2498) loss 0.3921 (0.3633) grad_norm 334027.1562 (inf) mem 14543MB +[2023-10-11 03:10:25 simmim_pretrain](main_simmim.py 218): INFO Train: [50/200][4500/6787] eta 0:09:31 lr 0.000200 time 0.2496 (0.2497) loss 0.3496 (0.3632) grad_norm 610268.6875 (inf) mem 14543MB +[2023-10-11 03:12:30 simmim_pretrain](main_simmim.py 218): INFO Train: [50/200][5000/6787] eta 0:07:26 lr 0.000200 time 0.2576 (0.2497) loss 0.3593 (0.3631) grad_norm 562246.0625 (inf) mem 14543MB +[2023-10-11 03:14:34 simmim_pretrain](main_simmim.py 218): INFO Train: [50/200][5500/6787] eta 0:05:21 lr 0.000200 time 0.2496 (0.2497) loss 0.3518 (0.3630) grad_norm 339417.7812 (inf) mem 14543MB +[2023-10-11 03:16:39 simmim_pretrain](main_simmim.py 218): INFO Train: [50/200][6000/6787] eta 0:03:16 lr 0.000200 time 0.2490 (0.2496) loss 0.3451 (0.3629) grad_norm 502384.1250 (inf) mem 14543MB +[2023-10-11 03:18:44 simmim_pretrain](main_simmim.py 218): INFO Train: [50/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2500 (0.2496) loss 0.3641 (0.3629) grad_norm 936886.3125 (inf) mem 14543MB +[2023-10-11 03:19:56 simmim_pretrain](main_simmim.py 228): INFO EPOCH 50 training takes 0:28:14 +[2023-10-11 03:19:57 simmim_pretrain](main_simmim.py 218): INFO Train: [51/200][0/6787] eta 2:43:51 lr 0.000200 time 1.4486 (1.4486) loss 0.3680 (0.3680) grad_norm 329809.7812 (329809.7812) 
mem 14543MB +[2023-10-11 03:22:01 simmim_pretrain](main_simmim.py 218): INFO Train: [51/200][500/6787] eta 0:26:19 lr 0.000200 time 0.2596 (0.2512) loss 0.3668 (0.3619) grad_norm 560819.8125 (451228.8438) mem 14543MB +[2023-10-11 03:24:06 simmim_pretrain](main_simmim.py 218): INFO Train: [51/200][1000/6787] eta 0:24:07 lr 0.000200 time 0.2457 (0.2502) loss 0.3689 (0.3624) grad_norm 610100.0625 (453724.4062) mem 14543MB +[2023-10-11 03:26:11 simmim_pretrain](main_simmim.py 218): INFO Train: [51/200][1500/6787] eta 0:22:01 lr 0.000200 time 0.2473 (0.2500) loss 0.3591 (0.3626) grad_norm 563284.8750 (inf) mem 14543MB +[2023-10-11 03:28:16 simmim_pretrain](main_simmim.py 218): INFO Train: [51/200][2000/6787] eta 0:19:56 lr 0.000200 time 0.2569 (0.2499) loss 0.3576 (0.3625) grad_norm 374223.4375 (inf) mem 14543MB +[2023-10-11 03:30:21 simmim_pretrain](main_simmim.py 218): INFO Train: [51/200][2500/6787] eta 0:17:51 lr 0.000200 time 0.2482 (0.2499) loss 0.3655 (0.3626) grad_norm 427651.9375 (inf) mem 14543MB +[2023-10-11 03:32:25 simmim_pretrain](main_simmim.py 218): INFO Train: [51/200][3000/6787] eta 0:15:46 lr 0.000200 time 0.2514 (0.2499) loss 0.3494 (0.3625) grad_norm 460327.7500 (inf) mem 14543MB +[2023-10-11 03:34:30 simmim_pretrain](main_simmim.py 218): INFO Train: [51/200][3500/6787] eta 0:13:41 lr 0.000200 time 0.2503 (0.2498) loss 0.3583 (0.3623) grad_norm 646937.8750 (inf) mem 14543MB +[2023-10-11 03:36:35 simmim_pretrain](main_simmim.py 218): INFO Train: [51/200][4000/6787] eta 0:11:36 lr 0.000200 time 0.2524 (0.2498) loss 0.3729 (0.3623) grad_norm 815029.9375 (inf) mem 14543MB +[2023-10-11 03:38:40 simmim_pretrain](main_simmim.py 218): INFO Train: [51/200][4500/6787] eta 0:09:31 lr 0.000200 time 0.2481 (0.2498) loss 0.3631 (0.3623) grad_norm 333791.5312 (inf) mem 14543MB +[2023-10-11 03:40:45 simmim_pretrain](main_simmim.py 218): INFO Train: [51/200][5000/6787] eta 0:07:26 lr 0.000200 time 0.2532 (0.2498) loss 0.3633 (0.3624) grad_norm 566627.5625 (inf) mem 
14543MB +[2023-10-11 03:42:50 simmim_pretrain](main_simmim.py 218): INFO Train: [51/200][5500/6787] eta 0:05:21 lr 0.000200 time 0.2463 (0.2499) loss 0.3683 (0.3624) grad_norm 473159.7500 (inf) mem 14543MB +[2023-10-11 03:44:55 simmim_pretrain](main_simmim.py 218): INFO Train: [51/200][6000/6787] eta 0:03:16 lr 0.000200 time 0.2470 (0.2499) loss 0.3584 (0.3625) grad_norm 339222.2812 (inf) mem 14543MB +[2023-10-11 03:47:00 simmim_pretrain](main_simmim.py 218): INFO Train: [51/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2525 (0.2499) loss 0.3605 (0.3625) grad_norm 322059.4375 (inf) mem 14543MB +[2023-10-11 03:48:12 simmim_pretrain](main_simmim.py 228): INFO EPOCH 51 training takes 0:28:16 +[2023-10-11 03:48:14 simmim_pretrain](main_simmim.py 218): INFO Train: [52/200][0/6787] eta 2:54:09 lr 0.000200 time 1.5397 (1.5397) loss 0.3724 (0.3724) grad_norm 338129.8750 (338129.8750) mem 14543MB +[2023-10-11 03:50:19 simmim_pretrain](main_simmim.py 218): INFO Train: [52/200][500/6787] eta 0:26:22 lr 0.000200 time 0.2464 (0.2517) loss 0.3580 (0.3647) grad_norm 343022.4375 (311183.4688) mem 14543MB +[2023-10-11 03:52:24 simmim_pretrain](main_simmim.py 218): INFO Train: [52/200][1000/6787] eta 0:24:11 lr 0.000200 time 0.2465 (0.2508) loss 0.3479 (0.3645) grad_norm 200447.3281 (298474.0312) mem 14543MB +[2023-10-11 03:54:28 simmim_pretrain](main_simmim.py 218): INFO Train: [52/200][1500/6787] eta 0:22:04 lr 0.000200 time 0.2545 (0.2505) loss 0.3681 (0.3643) grad_norm 349441.0938 (290169.6875) mem 14543MB +[2023-10-11 03:56:33 simmim_pretrain](main_simmim.py 218): INFO Train: [52/200][2000/6787] eta 0:19:58 lr 0.000200 time 0.2500 (0.2503) loss 0.3560 (0.3641) grad_norm 262786.4375 (302799.0625) mem 14543MB +[2023-10-11 03:58:38 simmim_pretrain](main_simmim.py 218): INFO Train: [52/200][2500/6787] eta 0:17:52 lr 0.000200 time 0.2483 (0.2502) loss 0.3666 (0.3636) grad_norm 584927.7500 (336663.4062) mem 14543MB +[2023-10-11 04:00:43 simmim_pretrain](main_simmim.py 218): INFO 
Train: [52/200][3000/6787] eta 0:15:47 lr 0.000200 time 0.2468 (0.2501) loss 0.3611 (0.3634) grad_norm 346435.5938 (inf) mem 14543MB +[2023-10-11 04:02:48 simmim_pretrain](main_simmim.py 218): INFO Train: [52/200][3500/6787] eta 0:13:41 lr 0.000200 time 0.2502 (0.2500) loss 0.3731 (0.3636) grad_norm 231555.1875 (inf) mem 14543MB +[2023-10-11 04:04:53 simmim_pretrain](main_simmim.py 218): INFO Train: [52/200][4000/6787] eta 0:11:36 lr 0.000200 time 0.2479 (0.2500) loss 0.3646 (0.3636) grad_norm 329689.0625 (inf) mem 14543MB +[2023-10-11 04:06:57 simmim_pretrain](main_simmim.py 218): INFO Train: [52/200][4500/6787] eta 0:09:31 lr 0.000200 time 0.2516 (0.2499) loss 0.3608 (0.3636) grad_norm 219905.7344 (inf) mem 14543MB +[2023-10-11 04:09:02 simmim_pretrain](main_simmim.py 218): INFO Train: [52/200][5000/6787] eta 0:07:26 lr 0.000200 time 0.2467 (0.2498) loss 0.3618 (0.3636) grad_norm 214960.0938 (inf) mem 14543MB +[2023-10-11 04:11:07 simmim_pretrain](main_simmim.py 218): INFO Train: [52/200][5500/6787] eta 0:05:21 lr 0.000200 time 0.2464 (0.2498) loss 0.3635 (0.3635) grad_norm 521343.5938 (inf) mem 14543MB +[2023-10-11 04:13:11 simmim_pretrain](main_simmim.py 218): INFO Train: [52/200][6000/6787] eta 0:03:16 lr 0.000200 time 0.2492 (0.2497) loss 0.3731 (0.3635) grad_norm 264869.9375 (inf) mem 14543MB +[2023-10-11 04:15:16 simmim_pretrain](main_simmim.py 218): INFO Train: [52/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2502 (0.2497) loss 0.3503 (0.3634) grad_norm 493943.5000 (inf) mem 14543MB +[2023-10-11 04:16:28 simmim_pretrain](main_simmim.py 228): INFO EPOCH 52 training takes 0:28:15 +[2023-10-11 04:16:29 simmim_pretrain](main_simmim.py 218): INFO Train: [53/200][0/6787] eta 2:39:04 lr 0.000200 time 1.4063 (1.4063) loss 0.3741 (0.3741) grad_norm 189475.5312 (189475.5312) mem 14543MB +[2023-10-11 04:18:34 simmim_pretrain](main_simmim.py 218): INFO Train: [53/200][500/6787] eta 0:26:18 lr 0.000200 time 0.2534 (0.2511) loss 0.3836 (0.3623) grad_norm 406536.2812 
(inf) mem 14543MB +[2023-10-11 04:20:39 simmim_pretrain](main_simmim.py 218): INFO Train: [53/200][1000/6787] eta 0:24:08 lr 0.000200 time 0.2491 (0.2504) loss 0.3574 (0.3623) grad_norm 598655.6875 (inf) mem 14543MB +[2023-10-11 04:22:43 simmim_pretrain](main_simmim.py 218): INFO Train: [53/200][1500/6787] eta 0:22:01 lr 0.000200 time 0.2523 (0.2500) loss 0.3794 (0.3627) grad_norm 352266.4375 (inf) mem 14543MB +[2023-10-11 04:24:48 simmim_pretrain](main_simmim.py 218): INFO Train: [53/200][2000/6787] eta 0:19:55 lr 0.000200 time 0.2589 (0.2498) loss 0.3681 (0.3630) grad_norm 229203.3125 (inf) mem 14543MB +[2023-10-11 04:26:52 simmim_pretrain](main_simmim.py 218): INFO Train: [53/200][2500/6787] eta 0:17:50 lr 0.000200 time 0.2457 (0.2497) loss 0.3639 (0.3632) grad_norm 428634.5938 (inf) mem 14543MB +[2023-10-11 04:28:57 simmim_pretrain](main_simmim.py 218): INFO Train: [53/200][3000/6787] eta 0:15:45 lr 0.000200 time 0.2469 (0.2497) loss 0.3580 (0.3634) grad_norm 299912.7188 (inf) mem 14543MB +[2023-10-11 04:31:02 simmim_pretrain](main_simmim.py 218): INFO Train: [53/200][3500/6787] eta 0:13:40 lr 0.000200 time 0.2511 (0.2496) loss 0.3597 (0.3634) grad_norm 258500.3594 (inf) mem 14543MB +[2023-10-11 04:33:07 simmim_pretrain](main_simmim.py 218): INFO Train: [53/200][4000/6787] eta 0:11:35 lr 0.000200 time 0.2462 (0.2496) loss 0.3689 (0.3636) grad_norm 188816.4219 (inf) mem 14543MB +[2023-10-11 04:35:11 simmim_pretrain](main_simmim.py 218): INFO Train: [53/200][4500/6787] eta 0:09:30 lr 0.000200 time 0.2465 (0.2496) loss 0.3430 (0.3637) grad_norm 365742.2812 (inf) mem 14543MB +[2023-10-11 04:37:16 simmim_pretrain](main_simmim.py 218): INFO Train: [53/200][5000/6787] eta 0:07:26 lr 0.000200 time 0.2496 (0.2496) loss 0.3647 (0.3637) grad_norm 323241.8125 (inf) mem 14543MB +[2023-10-11 04:39:21 simmim_pretrain](main_simmim.py 218): INFO Train: [53/200][5500/6787] eta 0:05:21 lr 0.000200 time 0.2494 (0.2496) loss 0.3451 (0.3637) grad_norm 396808.9375 (inf) mem 14543MB 
+[2023-10-11 04:41:26 simmim_pretrain](main_simmim.py 218): INFO Train: [53/200][6000/6787] eta 0:03:16 lr 0.000200 time 0.2511 (0.2496) loss 0.3568 (0.3636) grad_norm 268041.6250 (inf) mem 14543MB +[2023-10-11 04:43:31 simmim_pretrain](main_simmim.py 218): INFO Train: [53/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2497 (0.2496) loss 0.3724 (0.3637) grad_norm 307951.5312 (inf) mem 14543MB +[2023-10-11 04:44:43 simmim_pretrain](main_simmim.py 228): INFO EPOCH 53 training takes 0:28:15 +[2023-10-11 04:44:44 simmim_pretrain](main_simmim.py 218): INFO Train: [54/200][0/6787] eta 2:43:16 lr 0.000200 time 1.4434 (1.4434) loss 0.3816 (0.3816) grad_norm 135215.5000 (135215.5000) mem 14543MB +[2023-10-11 04:46:49 simmim_pretrain](main_simmim.py 218): INFO Train: [54/200][500/6787] eta 0:26:26 lr 0.000200 time 0.2496 (0.2524) loss 0.3786 (0.3637) grad_norm 204125.5156 (290804.1875) mem 14543MB +[2023-10-11 04:48:55 simmim_pretrain](main_simmim.py 218): INFO Train: [54/200][1000/6787] eta 0:24:14 lr 0.000200 time 0.2466 (0.2513) loss 0.3555 (0.3640) grad_norm 337903.7188 (292813.8750) mem 14543MB +[2023-10-11 04:51:00 simmim_pretrain](main_simmim.py 218): INFO Train: [54/200][1500/6787] eta 0:22:06 lr 0.000200 time 0.2474 (0.2508) loss 0.3638 (0.3639) grad_norm 538586.5000 (307389.1875) mem 14543MB +[2023-10-11 04:53:05 simmim_pretrain](main_simmim.py 218): INFO Train: [54/200][2000/6787] eta 0:19:59 lr 0.000200 time 0.2501 (0.2506) loss 0.3684 (0.3638) grad_norm 573000.6875 (338029.5000) mem 14543MB +[2023-10-11 04:55:10 simmim_pretrain](main_simmim.py 218): INFO Train: [54/200][2500/6787] eta 0:17:54 lr 0.000200 time 0.2518 (0.2505) loss 0.3692 (0.3635) grad_norm 613638.8750 (367883.0938) mem 14543MB +[2023-10-11 04:57:15 simmim_pretrain](main_simmim.py 218): INFO Train: [54/200][3000/6787] eta 0:15:48 lr 0.000200 time 0.2521 (0.2505) loss 0.3762 (0.3634) grad_norm 534723.0000 (387348.2812) mem 14543MB +[2023-10-11 04:59:20 simmim_pretrain](main_simmim.py 218): INFO 
Train: [54/200][3500/6787] eta 0:13:43 lr 0.000200 time 0.2498 (0.2504) loss 0.3880 (0.3635) grad_norm 347911.3438 (inf) mem 14543MB +[2023-10-11 05:01:25 simmim_pretrain](main_simmim.py 218): INFO Train: [54/200][4000/6787] eta 0:11:37 lr 0.000200 time 0.2631 (0.2503) loss 0.3488 (0.3635) grad_norm 324752.1250 (inf) mem 14543MB +[2023-10-11 05:03:30 simmim_pretrain](main_simmim.py 218): INFO Train: [54/200][4500/6787] eta 0:09:32 lr 0.000200 time 0.2465 (0.2503) loss 0.3550 (0.3636) grad_norm 306758.8438 (inf) mem 14543MB +[2023-10-11 05:05:34 simmim_pretrain](main_simmim.py 218): INFO Train: [54/200][5000/6787] eta 0:07:27 lr 0.000200 time 0.2519 (0.2502) loss 0.3842 (0.3638) grad_norm 284463.9375 (inf) mem 14543MB +[2023-10-11 05:07:39 simmim_pretrain](main_simmim.py 218): INFO Train: [54/200][5500/6787] eta 0:05:21 lr 0.000200 time 0.2536 (0.2502) loss 0.3631 (0.3638) grad_norm 309042.9062 (inf) mem 14543MB +[2023-10-11 05:09:44 simmim_pretrain](main_simmim.py 218): INFO Train: [54/200][6000/6787] eta 0:03:16 lr 0.000200 time 0.2523 (0.2502) loss 0.3498 (0.3637) grad_norm 571494.4375 (inf) mem 14543MB +[2023-10-11 05:11:49 simmim_pretrain](main_simmim.py 218): INFO Train: [54/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2506 (0.2501) loss 0.3789 (0.3636) grad_norm 316518.9375 (inf) mem 14543MB +[2023-10-11 05:13:01 simmim_pretrain](main_simmim.py 228): INFO EPOCH 54 training takes 0:28:18 +[2023-10-11 05:13:03 simmim_pretrain](main_simmim.py 218): INFO Train: [55/200][0/6787] eta 2:39:58 lr 0.000200 time 1.4142 (1.4142) loss 0.3666 (0.3666) grad_norm 373138.6250 (373138.6250) mem 14543MB +[2023-10-11 05:15:07 simmim_pretrain](main_simmim.py 218): INFO Train: [55/200][500/6787] eta 0:26:23 lr 0.000200 time 0.2470 (0.2518) loss 0.3757 (0.3639) grad_norm 201458.0781 (304766.2812) mem 14543MB +[2023-10-11 05:17:12 simmim_pretrain](main_simmim.py 218): INFO Train: [55/200][1000/6787] eta 0:24:10 lr 0.000200 time 0.2594 (0.2506) loss 0.3644 (0.3637) grad_norm 
237650.2344 (303395.6250) mem 14543MB +[2023-10-11 05:19:17 simmim_pretrain](main_simmim.py 218): INFO Train: [55/200][1500/6787] eta 0:22:02 lr 0.000200 time 0.2459 (0.2502) loss 0.3680 (0.3641) grad_norm 388402.5938 (304719.1562) mem 14543MB +[2023-10-11 05:21:21 simmim_pretrain](main_simmim.py 218): INFO Train: [55/200][2000/6787] eta 0:19:56 lr 0.000200 time 0.2467 (0.2500) loss 0.3593 (0.3640) grad_norm 432491.8125 (337349.0312) mem 14543MB +[2023-10-11 05:23:26 simmim_pretrain](main_simmim.py 218): INFO Train: [55/200][2500/6787] eta 0:17:51 lr 0.000200 time 0.2547 (0.2498) loss 0.3549 (0.3634) grad_norm 413327.5000 (inf) mem 14543MB +[2023-10-11 05:25:31 simmim_pretrain](main_simmim.py 218): INFO Train: [55/200][3000/6787] eta 0:15:45 lr 0.000200 time 0.2427 (0.2498) loss 0.3729 (0.3636) grad_norm 169436.2656 (inf) mem 14543MB +[2023-10-11 05:27:36 simmim_pretrain](main_simmim.py 218): INFO Train: [55/200][3500/6787] eta 0:13:40 lr 0.000200 time 0.2521 (0.2498) loss 0.3611 (0.3637) grad_norm 251431.1406 (inf) mem 14543MB +[2023-10-11 05:29:40 simmim_pretrain](main_simmim.py 218): INFO Train: [55/200][4000/6787] eta 0:11:35 lr 0.000200 time 0.2472 (0.2497) loss 0.3703 (0.3637) grad_norm 263475.0625 (inf) mem 14543MB +[2023-10-11 05:31:45 simmim_pretrain](main_simmim.py 218): INFO Train: [55/200][4500/6787] eta 0:09:31 lr 0.000200 time 0.2472 (0.2497) loss 0.3467 (0.3640) grad_norm 515986.5938 (inf) mem 14543MB +[2023-10-11 05:33:50 simmim_pretrain](main_simmim.py 218): INFO Train: [55/200][5000/6787] eta 0:07:26 lr 0.000200 time 0.2469 (0.2497) loss 0.3753 (0.3640) grad_norm 357400.7500 (inf) mem 14543MB +[2023-10-11 05:35:54 simmim_pretrain](main_simmim.py 218): INFO Train: [55/200][5500/6787] eta 0:05:21 lr 0.000200 time 0.2465 (0.2496) loss 0.3626 (0.3640) grad_norm 316332.9688 (inf) mem 14543MB +[2023-10-11 05:37:59 simmim_pretrain](main_simmim.py 218): INFO Train: [55/200][6000/6787] eta 0:03:16 lr 0.000200 time 0.2474 (0.2496) loss 0.3652 (0.3641) 
grad_norm 306102.3125 (inf) mem 14543MB +[2023-10-11 05:40:04 simmim_pretrain](main_simmim.py 218): INFO Train: [55/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2457 (0.2496) loss 0.3545 (0.3640) grad_norm 312450.6875 (inf) mem 14543MB +[2023-10-11 05:41:16 simmim_pretrain](main_simmim.py 228): INFO EPOCH 55 training takes 0:28:14 +[2023-10-11 05:41:18 simmim_pretrain](main_simmim.py 218): INFO Train: [56/200][0/6787] eta 2:54:25 lr 0.000200 time 1.5420 (1.5420) loss 0.3633 (0.3633) grad_norm 213465.1719 (213465.1719) mem 14543MB +[2023-10-11 05:43:22 simmim_pretrain](main_simmim.py 218): INFO Train: [56/200][500/6787] eta 0:26:25 lr 0.000200 time 0.2522 (0.2522) loss 0.3426 (0.3637) grad_norm 549147.5000 (410852.0938) mem 14543MB +[2023-10-11 05:45:27 simmim_pretrain](main_simmim.py 218): INFO Train: [56/200][1000/6787] eta 0:24:11 lr 0.000200 time 0.2564 (0.2509) loss 0.3264 (0.3629) grad_norm 712221.4375 (437120.6562) mem 14543MB +[2023-10-11 05:47:32 simmim_pretrain](main_simmim.py 218): INFO Train: [56/200][1500/6787] eta 0:22:04 lr 0.000200 time 0.2465 (0.2506) loss 0.3655 (0.3626) grad_norm 872100.8750 (461076.6562) mem 14543MB +[2023-10-11 05:49:37 simmim_pretrain](main_simmim.py 218): INFO Train: [56/200][2000/6787] eta 0:19:58 lr 0.000200 time 0.2496 (0.2504) loss 0.5545 (0.3697) grad_norm 2022.5883 (inf) mem 14543MB +[2023-10-11 05:51:42 simmim_pretrain](main_simmim.py 218): INFO Train: [56/200][2500/6787] eta 0:17:52 lr 0.000200 time 0.2460 (0.2502) loss 0.4950 (0.3989) grad_norm 5994.0884 (inf) mem 14543MB +[2023-10-11 05:53:46 simmim_pretrain](main_simmim.py 218): INFO Train: [56/200][3000/6787] eta 0:15:46 lr 0.000200 time 0.2493 (0.2500) loss 0.4947 (0.4146) grad_norm 5440.5225 (inf) mem 14543MB +[2023-10-11 05:55:51 simmim_pretrain](main_simmim.py 218): INFO Train: [56/200][3500/6787] eta 0:13:41 lr 0.000200 time 0.2496 (0.2499) loss 0.4753 (0.4229) grad_norm 9829.1738 (inf) mem 14543MB +[2023-10-11 05:57:56 simmim_pretrain](main_simmim.py 218): 
INFO Train: [56/200][4000/6787] eta 0:11:36 lr 0.000200 time 0.2491 (0.2498) loss 0.4198 (0.4257) grad_norm 14463.5820 (inf) mem 14543MB +[2023-10-11 06:00:01 simmim_pretrain](main_simmim.py 218): INFO Train: [56/200][4500/6787] eta 0:09:31 lr 0.000200 time 0.2492 (0.2498) loss 0.3962 (0.4241) grad_norm 20710.0352 (inf) mem 14543MB +[2023-10-11 06:02:05 simmim_pretrain](main_simmim.py 218): INFO Train: [56/200][5000/6787] eta 0:07:26 lr 0.000200 time 0.2490 (0.2498) loss 0.3704 (0.4213) grad_norm 10343.1875 (inf) mem 14543MB +[2023-10-11 06:04:10 simmim_pretrain](main_simmim.py 218): INFO Train: [56/200][5500/6787] eta 0:05:21 lr 0.000200 time 0.2523 (0.2498) loss 0.3971 (0.4183) grad_norm 8229.6523 (inf) mem 14543MB +[2023-10-11 06:06:15 simmim_pretrain](main_simmim.py 218): INFO Train: [56/200][6000/6787] eta 0:03:16 lr 0.000200 time 0.2467 (0.2498) loss 0.3810 (0.4151) grad_norm 9307.1475 (inf) mem 14543MB +[2023-10-11 06:08:20 simmim_pretrain](main_simmim.py 218): INFO Train: [56/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2593 (0.2498) loss 0.3769 (0.4120) grad_norm 17468.2266 (inf) mem 14543MB +[2023-10-11 06:09:32 simmim_pretrain](main_simmim.py 228): INFO EPOCH 56 training takes 0:28:15 +[2023-10-11 06:09:33 simmim_pretrain](main_simmim.py 218): INFO Train: [57/200][0/6787] eta 2:46:16 lr 0.000200 time 1.4699 (1.4699) loss 0.3819 (0.3819) grad_norm 21356.7773 (21356.7773) mem 14543MB +[2023-10-11 06:11:38 simmim_pretrain](main_simmim.py 218): INFO Train: [57/200][500/6787] eta 0:26:20 lr 0.000200 time 0.2459 (0.2515) loss 0.3795 (0.3728) grad_norm 28660.2031 (24196.7012) mem 14543MB +[2023-10-11 06:13:42 simmim_pretrain](main_simmim.py 218): INFO Train: [57/200][1000/6787] eta 0:24:08 lr 0.000200 time 0.2484 (0.2503) loss 0.3651 (0.3726) grad_norm 20412.0098 (24111.9883) mem 14543MB +[2023-10-11 06:15:47 simmim_pretrain](main_simmim.py 218): INFO Train: [57/200][1500/6787] eta 0:22:01 lr 0.000200 time 0.2529 (0.2499) loss 0.3744 (0.3717) grad_norm 
18188.6523 (25952.2031) mem 14543MB +[2023-10-11 06:17:52 simmim_pretrain](main_simmim.py 218): INFO Train: [57/200][2000/6787] eta 0:19:55 lr 0.000200 time 0.2449 (0.2497) loss 0.3675 (0.3710) grad_norm 52481.3477 (28705.1250) mem 14543MB +[2023-10-11 06:19:56 simmim_pretrain](main_simmim.py 218): INFO Train: [57/200][2500/6787] eta 0:17:50 lr 0.000200 time 0.2547 (0.2496) loss 0.3701 (0.3705) grad_norm 44398.6250 (31606.7324) mem 14543MB +[2023-10-11 06:22:01 simmim_pretrain](main_simmim.py 218): INFO Train: [57/200][3000/6787] eta 0:15:45 lr 0.000200 time 0.2451 (0.2495) loss 0.3719 (0.3702) grad_norm 44861.0195 (33857.7969) mem 14543MB +[2023-10-11 06:24:05 simmim_pretrain](main_simmim.py 218): INFO Train: [57/200][3500/6787] eta 0:13:40 lr 0.000200 time 0.2491 (0.2495) loss 0.3695 (0.3697) grad_norm 37820.1406 (36847.7344) mem 14543MB +[2023-10-11 06:26:10 simmim_pretrain](main_simmim.py 218): INFO Train: [57/200][4000/6787] eta 0:11:35 lr 0.000200 time 0.2523 (0.2495) loss 0.3829 (0.3693) grad_norm 89612.6172 (40809.8164) mem 14543MB +[2023-10-11 06:28:15 simmim_pretrain](main_simmim.py 218): INFO Train: [57/200][4500/6787] eta 0:09:30 lr 0.000200 time 0.2495 (0.2495) loss 0.3915 (0.3689) grad_norm 95887.1953 (44727.4023) mem 14543MB +[2023-10-11 06:30:20 simmim_pretrain](main_simmim.py 218): INFO Train: [57/200][5000/6787] eta 0:07:25 lr 0.000200 time 0.2463 (0.2495) loss 0.3682 (0.3686) grad_norm 36438.8633 (48597.1172) mem 14543MB +[2023-10-11 06:32:24 simmim_pretrain](main_simmim.py 218): INFO Train: [57/200][5500/6787] eta 0:05:21 lr 0.000200 time 0.2454 (0.2495) loss 0.3444 (0.3683) grad_norm 161019.0781 (54351.9453) mem 14543MB +[2023-10-11 06:34:29 simmim_pretrain](main_simmim.py 218): INFO Train: [57/200][6000/6787] eta 0:03:16 lr 0.000200 time 0.2469 (0.2495) loss 0.3418 (0.3681) grad_norm 176578.9219 (63988.3828) mem 14543MB +[2023-10-11 06:36:34 simmim_pretrain](main_simmim.py 218): INFO Train: [57/200][6500/6787] eta 0:01:11 lr 0.000200 time 
0.2450 (0.2495) loss 0.3682 (0.3678) grad_norm 141154.0781 (71518.5469) mem 14543MB +[2023-10-11 06:37:46 simmim_pretrain](main_simmim.py 228): INFO EPOCH 57 training takes 0:28:14 +[2023-10-11 06:37:47 simmim_pretrain](main_simmim.py 218): INFO Train: [58/200][0/6787] eta 2:47:45 lr 0.000200 time 1.4830 (1.4830) loss 0.3654 (0.3654) grad_norm 121403.5547 (121403.5547) mem 14543MB +[2023-10-11 06:39:52 simmim_pretrain](main_simmim.py 218): INFO Train: [58/200][500/6787] eta 0:26:21 lr 0.000200 time 0.2486 (0.2515) loss 0.3538 (0.3654) grad_norm 267333.2500 (169665.2656) mem 14543MB +[2023-10-11 06:41:57 simmim_pretrain](main_simmim.py 218): INFO Train: [58/200][1000/6787] eta 0:24:10 lr 0.000200 time 0.2522 (0.2506) loss 0.3564 (0.3646) grad_norm 210185.5156 (195570.2812) mem 14543MB +[2023-10-11 06:44:02 simmim_pretrain](main_simmim.py 218): INFO Train: [58/200][1500/6787] eta 0:22:03 lr 0.000200 time 0.2527 (0.2504) loss 0.3647 (0.3641) grad_norm 507732.5625 (231626.2344) mem 14543MB +[2023-10-11 06:46:07 simmim_pretrain](main_simmim.py 218): INFO Train: [58/200][2000/6787] eta 0:19:58 lr 0.000200 time 0.2518 (0.2503) loss 0.3607 (0.3639) grad_norm 597508.8125 (262564.0312) mem 14543MB +[2023-10-11 06:48:12 simmim_pretrain](main_simmim.py 218): INFO Train: [58/200][2500/6787] eta 0:17:53 lr 0.000200 time 0.2594 (0.2503) loss 0.3597 (0.3638) grad_norm 246748.2656 (inf) mem 14543MB +[2023-10-11 06:50:17 simmim_pretrain](main_simmim.py 218): INFO Train: [58/200][3000/6787] eta 0:15:47 lr 0.000200 time 0.2539 (0.2502) loss 0.3564 (0.3639) grad_norm 331113.1875 (inf) mem 14543MB +[2023-10-11 06:52:22 simmim_pretrain](main_simmim.py 218): INFO Train: [58/200][3500/6787] eta 0:13:42 lr 0.000200 time 0.2462 (0.2501) loss 0.3629 (0.3637) grad_norm 260113.2031 (inf) mem 14543MB +[2023-10-11 06:54:27 simmim_pretrain](main_simmim.py 218): INFO Train: [58/200][4000/6787] eta 0:11:36 lr 0.000200 time 0.2487 (0.2501) loss 0.3716 (0.3637) grad_norm 329937.1875 (inf) mem 14543MB 
+[2023-10-11 06:56:32 simmim_pretrain](main_simmim.py 218): INFO Train: [58/200][4500/6787] eta 0:09:31 lr 0.000200 time 0.2471 (0.2501) loss 0.3602 (0.3637) grad_norm 300800.4062 (inf) mem 14543MB +[2023-10-11 06:58:36 simmim_pretrain](main_simmim.py 218): INFO Train: [58/200][5000/6787] eta 0:07:26 lr 0.000200 time 0.2513 (0.2500) loss 0.3538 (0.3636) grad_norm 385134.5938 (inf) mem 14543MB +[2023-10-11 07:00:41 simmim_pretrain](main_simmim.py 218): INFO Train: [58/200][5500/6787] eta 0:05:21 lr 0.000200 time 0.2467 (0.2500) loss 0.3767 (0.3637) grad_norm 488168.4062 (inf) mem 14543MB +[2023-10-11 07:02:46 simmim_pretrain](main_simmim.py 218): INFO Train: [58/200][6000/6787] eta 0:03:16 lr 0.000200 time 0.2507 (0.2499) loss 0.3658 (0.3637) grad_norm 295277.1562 (inf) mem 14543MB +[2023-10-11 07:04:51 simmim_pretrain](main_simmim.py 218): INFO Train: [58/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2490 (0.2499) loss 0.3619 (0.3637) grad_norm 318438.1875 (inf) mem 14543MB +[2023-10-11 07:06:03 simmim_pretrain](main_simmim.py 228): INFO EPOCH 58 training takes 0:28:16 +[2023-10-11 07:06:04 simmim_pretrain](main_simmim.py 218): INFO Train: [59/200][0/6787] eta 2:48:09 lr 0.000200 time 1.4866 (1.4866) loss 0.3861 (0.3861) grad_norm 171094.7188 (171094.7188) mem 14543MB +[2023-10-11 07:08:09 simmim_pretrain](main_simmim.py 218): INFO Train: [59/200][500/6787] eta 0:26:22 lr 0.000200 time 0.2556 (0.2517) loss 0.3893 (0.3650) grad_norm 263277.6875 (234104.7812) mem 14543MB +[2023-10-11 07:10:14 simmim_pretrain](main_simmim.py 218): INFO Train: [59/200][1000/6787] eta 0:24:10 lr 0.000200 time 0.2506 (0.2507) loss 0.3592 (0.3648) grad_norm 168489.4688 (224050.7500) mem 14543MB +[2023-10-11 07:12:18 simmim_pretrain](main_simmim.py 218): INFO Train: [59/200][1500/6787] eta 0:22:03 lr 0.000200 time 0.2592 (0.2502) loss 0.3361 (0.3645) grad_norm 201829.3594 (227714.0938) mem 14543MB +[2023-10-11 07:14:23 simmim_pretrain](main_simmim.py 218): INFO Train: [59/200][2000/6787] 
eta 0:19:57 lr 0.000200 time 0.2533 (0.2501) loss 0.3629 (0.3643) grad_norm 548683.4375 (247417.6250) mem 14543MB +[2023-10-11 07:16:30 simmim_pretrain](main_simmim.py 218): INFO Train: [59/200][2500/6787] eta 0:17:55 lr 0.000200 time 0.2540 (0.2509) loss 0.3416 (0.3641) grad_norm 536888.0000 (278132.8750) mem 14543MB +[2023-10-11 07:18:37 simmim_pretrain](main_simmim.py 218): INFO Train: [59/200][3000/6787] eta 0:15:52 lr 0.000200 time 0.2593 (0.2514) loss 0.3531 (0.3641) grad_norm 857486.1875 (288827.4375) mem 14543MB +[2023-10-11 07:20:44 simmim_pretrain](main_simmim.py 218): INFO Train: [59/200][3500/6787] eta 0:13:47 lr 0.000200 time 0.2535 (0.2518) loss 0.3582 (0.3637) grad_norm 247973.7969 (inf) mem 14543MB +[2023-10-11 07:22:51 simmim_pretrain](main_simmim.py 218): INFO Train: [59/200][4000/6787] eta 0:11:42 lr 0.000200 time 0.2529 (0.2520) loss 0.3634 (0.3639) grad_norm 270071.6875 (inf) mem 14543MB +[2023-10-11 07:24:58 simmim_pretrain](main_simmim.py 218): INFO Train: [59/200][4500/6787] eta 0:09:36 lr 0.000200 time 0.2536 (0.2522) loss 0.3653 (0.3640) grad_norm 241580.2812 (inf) mem 14543MB +[2023-10-11 07:27:05 simmim_pretrain](main_simmim.py 218): INFO Train: [59/200][5000/6787] eta 0:07:31 lr 0.000200 time 0.2531 (0.2524) loss 0.3675 (0.3640) grad_norm 176652.3750 (inf) mem 14543MB +[2023-10-11 07:29:12 simmim_pretrain](main_simmim.py 218): INFO Train: [59/200][5500/6787] eta 0:05:24 lr 0.000200 time 0.2531 (0.2525) loss 0.3799 (0.3639) grad_norm 274522.0625 (inf) mem 14543MB +[2023-10-11 07:31:19 simmim_pretrain](main_simmim.py 218): INFO Train: [59/200][6000/6787] eta 0:03:18 lr 0.000200 time 0.2531 (0.2526) loss 0.3697 (0.3639) grad_norm 293115.8125 (inf) mem 14543MB +[2023-10-11 07:33:26 simmim_pretrain](main_simmim.py 218): INFO Train: [59/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2538 (0.2527) loss 0.3594 (0.3639) grad_norm 525726.6250 (inf) mem 14543MB +[2023-10-11 07:34:39 simmim_pretrain](main_simmim.py 228): INFO EPOCH 59 training 
takes 0:28:36 +[2023-10-11 07:34:41 simmim_pretrain](main_simmim.py 218): INFO Train: [60/200][0/6787] eta 3:02:51 lr 0.000200 time 1.6166 (1.6166) loss 0.3566 (0.3566) grad_norm 179210.2656 (179210.2656) mem 14543MB +[2023-10-11 07:36:45 simmim_pretrain](main_simmim.py 218): INFO Train: [60/200][500/6787] eta 0:26:21 lr 0.000200 time 0.2518 (0.2516) loss 0.3408 (0.3640) grad_norm 217925.3125 (245494.9688) mem 14543MB +[2023-10-11 07:38:50 simmim_pretrain](main_simmim.py 218): INFO Train: [60/200][1000/6787] eta 0:24:09 lr 0.000200 time 0.2532 (0.2505) loss 0.3674 (0.3640) grad_norm 250901.8125 (238155.1094) mem 14543MB +[2023-10-11 07:40:54 simmim_pretrain](main_simmim.py 218): INFO Train: [60/200][1500/6787] eta 0:22:02 lr 0.000200 time 0.2460 (0.2501) loss 0.3567 (0.3637) grad_norm 227277.7812 (243554.0469) mem 14543MB +[2023-10-11 07:42:59 simmim_pretrain](main_simmim.py 218): INFO Train: [60/200][2000/6787] eta 0:19:56 lr 0.000200 time 0.2484 (0.2500) loss 0.3621 (0.3638) grad_norm 438943.6250 (242379.9688) mem 14543MB +[2023-10-11 07:45:04 simmim_pretrain](main_simmim.py 218): INFO Train: [60/200][2500/6787] eta 0:17:51 lr 0.000200 time 0.2474 (0.2499) loss 0.3718 (0.3635) grad_norm 275961.3438 (259831.9062) mem 14543MB +[2023-10-11 07:47:09 simmim_pretrain](main_simmim.py 218): INFO Train: [60/200][3000/6787] eta 0:15:46 lr 0.000200 time 0.2547 (0.2499) loss 0.3587 (0.3635) grad_norm 506672.1875 (278491.7188) mem 14543MB +[2023-10-11 07:49:14 simmim_pretrain](main_simmim.py 218): INFO Train: [60/200][3500/6787] eta 0:13:41 lr 0.000200 time 0.2521 (0.2499) loss 0.3585 (0.3634) grad_norm 366454.0312 (289703.6250) mem 14543MB +[2023-10-11 07:51:19 simmim_pretrain](main_simmim.py 218): INFO Train: [60/200][4000/6787] eta 0:11:36 lr 0.000200 time 0.2462 (0.2499) loss 0.3689 (0.3632) grad_norm 881423.0625 (inf) mem 14543MB +[2023-10-11 07:53:24 simmim_pretrain](main_simmim.py 218): INFO Train: [60/200][4500/6787] eta 0:09:31 lr 0.000200 time 0.2464 (0.2499) loss 
0.3500 (0.3630) grad_norm 222504.4375 (inf) mem 14543MB +[2023-10-11 07:55:28 simmim_pretrain](main_simmim.py 218): INFO Train: [60/200][5000/6787] eta 0:07:26 lr 0.000200 time 0.2489 (0.2498) loss 0.3705 (0.3629) grad_norm 283103.1250 (inf) mem 14543MB +[2023-10-11 07:57:33 simmim_pretrain](main_simmim.py 218): INFO Train: [60/200][5500/6787] eta 0:05:21 lr 0.000200 time 0.2499 (0.2498) loss 0.3385 (0.3628) grad_norm 414764.5938 (inf) mem 14543MB +[2023-10-11 07:59:38 simmim_pretrain](main_simmim.py 218): INFO Train: [60/200][6000/6787] eta 0:03:16 lr 0.000200 time 0.2474 (0.2498) loss 0.3709 (0.3627) grad_norm 751219.0625 (inf) mem 14543MB +[2023-10-11 08:01:43 simmim_pretrain](main_simmim.py 218): INFO Train: [60/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2481 (0.2498) loss 0.3724 (0.3627) grad_norm 497558.5625 (inf) mem 14543MB +[2023-10-11 08:02:55 simmim_pretrain](main_simmim.py 228): INFO EPOCH 60 training takes 0:28:16 +[2023-10-11 08:02:55 simmim_pretrain](utils.py 62): INFO /root/autodl-tmp/LSQ-simmim/checkpoint/simmim_pretrain/vit_simmim/ckpt_epoch_60.pth saving...... +[2023-10-11 08:02:56 simmim_pretrain](utils.py 64): INFO /root/autodl-tmp/LSQ-simmim/checkpoint/simmim_pretrain/vit_simmim/ckpt_epoch_60.pth saved !!! 
+[2023-10-11 08:02:57 simmim_pretrain](main_simmim.py 218): INFO Train: [61/200][0/6787] eta 2:28:35 lr 0.000200 time 1.3136 (1.3136) loss 0.3442 (0.3442) grad_norm 527017.0000 (527017.0000) mem 14543MB +[2023-10-11 08:05:02 simmim_pretrain](main_simmim.py 218): INFO Train: [61/200][500/6787] eta 0:26:19 lr 0.000200 time 0.2487 (0.2512) loss 0.3651 (0.3636) grad_norm 251129.5156 (467523.2500) mem 14543MB +[2023-10-11 08:07:07 simmim_pretrain](main_simmim.py 218): INFO Train: [61/200][1000/6787] eta 0:24:09 lr 0.000200 time 0.2459 (0.2504) loss 0.3940 (0.3634) grad_norm 328609.7812 (inf) mem 14543MB +[2023-10-11 08:09:11 simmim_pretrain](main_simmim.py 218): INFO Train: [61/200][1500/6787] eta 0:22:02 lr 0.000200 time 0.2454 (0.2501) loss 0.3607 (0.3638) grad_norm 265209.6875 (inf) mem 14543MB +[2023-10-11 08:11:16 simmim_pretrain](main_simmim.py 218): INFO Train: [61/200][2000/6787] eta 0:19:56 lr 0.000200 time 0.2449 (0.2500) loss 0.3559 (0.3638) grad_norm 239099.8906 (inf) mem 14543MB +[2023-10-11 08:13:21 simmim_pretrain](main_simmim.py 218): INFO Train: [61/200][2500/6787] eta 0:17:51 lr 0.000200 time 0.2492 (0.2499) loss 0.3824 (0.3638) grad_norm 151659.6094 (inf) mem 14543MB +[2023-10-11 08:15:26 simmim_pretrain](main_simmim.py 218): INFO Train: [61/200][3000/6787] eta 0:15:46 lr 0.000200 time 0.2590 (0.2498) loss 0.3791 (0.3642) grad_norm 159163.5938 (inf) mem 14543MB +[2023-10-11 08:17:30 simmim_pretrain](main_simmim.py 218): INFO Train: [61/200][3500/6787] eta 0:13:41 lr 0.000200 time 0.2501 (0.2498) loss 0.3595 (0.3645) grad_norm 166781.5938 (inf) mem 14543MB +[2023-10-11 08:19:35 simmim_pretrain](main_simmim.py 218): INFO Train: [61/200][4000/6787] eta 0:11:36 lr 0.000200 time 0.2500 (0.2498) loss 0.3789 (0.3647) grad_norm 167285.2500 (inf) mem 14543MB +[2023-10-11 08:21:40 simmim_pretrain](main_simmim.py 218): INFO Train: [61/200][4500/6787] eta 0:09:31 lr 0.000200 time 0.2496 (0.2498) loss 0.3309 (0.3647) grad_norm 169171.1406 (inf) mem 14543MB 
+[2023-10-11 08:23:45 simmim_pretrain](main_simmim.py 218): INFO Train: [61/200][5000/6787] eta 0:07:26 lr 0.000200 time 0.2484 (0.2497) loss 0.3769 (0.3646) grad_norm 96959.6484 (inf) mem 14543MB +[2023-10-11 08:25:50 simmim_pretrain](main_simmim.py 218): INFO Train: [61/200][5500/6787] eta 0:05:21 lr 0.000200 time 0.2493 (0.2497) loss 0.3428 (0.3645) grad_norm 251174.3125 (inf) mem 14543MB +[2023-10-11 08:27:55 simmim_pretrain](main_simmim.py 218): INFO Train: [61/200][6000/6787] eta 0:03:16 lr 0.000200 time 0.2491 (0.2498) loss 0.3385 (0.3645) grad_norm 231342.7500 (inf) mem 14543MB +[2023-10-11 08:29:59 simmim_pretrain](main_simmim.py 218): INFO Train: [61/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2511 (0.2497) loss 0.3734 (0.3643) grad_norm 293117.8750 (inf) mem 14543MB +[2023-10-11 08:31:12 simmim_pretrain](main_simmim.py 228): INFO EPOCH 61 training takes 0:28:15 +[2023-10-11 08:31:13 simmim_pretrain](main_simmim.py 218): INFO Train: [62/200][0/6787] eta 2:53:16 lr 0.000200 time 1.5318 (1.5318) loss 0.3577 (0.3577) grad_norm 260462.5156 (260462.5156) mem 14543MB +[2023-10-11 08:33:18 simmim_pretrain](main_simmim.py 218): INFO Train: [62/200][500/6787] eta 0:26:21 lr 0.000200 time 0.2467 (0.2516) loss 0.3852 (0.3623) grad_norm 190863.5781 (297772.9062) mem 14543MB +[2023-10-11 08:35:23 simmim_pretrain](main_simmim.py 218): INFO Train: [62/200][1000/6787] eta 0:24:09 lr 0.000200 time 0.2466 (0.2505) loss 0.3516 (0.3625) grad_norm 310195.0938 (340491.5000) mem 14543MB +[2023-10-11 08:37:27 simmim_pretrain](main_simmim.py 218): INFO Train: [62/200][1500/6787] eta 0:22:02 lr 0.000200 time 0.2496 (0.2502) loss 0.3759 (0.3624) grad_norm 224799.9531 (352381.9062) mem 14543MB +[2023-10-11 08:39:32 simmim_pretrain](main_simmim.py 218): INFO Train: [62/200][2000/6787] eta 0:19:56 lr 0.000200 time 0.2490 (0.2500) loss 0.3648 (0.3622) grad_norm 453300.7500 (inf) mem 14543MB +[2023-10-11 08:41:37 simmim_pretrain](main_simmim.py 218): INFO Train: [62/200][2500/6787] 
eta 0:17:51 lr 0.000200 time 0.2493 (0.2499) loss 0.3595 (0.3622) grad_norm 396486.9688 (inf) mem 14543MB +[2023-10-11 08:43:42 simmim_pretrain](main_simmim.py 218): INFO Train: [62/200][3000/6787] eta 0:15:46 lr 0.000200 time 0.2506 (0.2499) loss 0.3502 (0.3623) grad_norm 285901.0938 (inf) mem 14543MB +[2023-10-11 08:45:48 simmim_pretrain](main_simmim.py 218): INFO Train: [62/200][3500/6787] eta 0:13:42 lr 0.000200 time 0.2506 (0.2502) loss 0.3545 (0.3624) grad_norm 243416.7812 (inf) mem 14543MB +[2023-10-11 08:47:54 simmim_pretrain](main_simmim.py 218): INFO Train: [62/200][4000/6787] eta 0:11:37 lr 0.000200 time 0.2514 (0.2504) loss 0.3689 (0.3624) grad_norm 335249.0312 (inf) mem 14543MB +[2023-10-11 08:50:00 simmim_pretrain](main_simmim.py 218): INFO Train: [62/200][4500/6787] eta 0:09:33 lr 0.000200 time 0.2542 (0.2506) loss 0.3760 (0.3626) grad_norm 213098.8438 (inf) mem 14543MB +[2023-10-11 08:52:06 simmim_pretrain](main_simmim.py 218): INFO Train: [62/200][5000/6787] eta 0:07:28 lr 0.000200 time 0.2513 (0.2508) loss 0.3536 (0.3625) grad_norm 279225.0312 (inf) mem 14543MB +[2023-10-11 08:54:12 simmim_pretrain](main_simmim.py 218): INFO Train: [62/200][5500/6787] eta 0:05:22 lr 0.000200 time 0.2505 (0.2509) loss 0.3821 (0.3625) grad_norm 453808.2812 (inf) mem 14543MB +[2023-10-11 08:56:18 simmim_pretrain](main_simmim.py 218): INFO Train: [62/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2568 (0.2509) loss 0.3696 (0.3625) grad_norm 157112.4062 (inf) mem 14543MB +[2023-10-11 08:58:24 simmim_pretrain](main_simmim.py 218): INFO Train: [62/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2507 (0.2510) loss 0.3533 (0.3625) grad_norm 936199.6875 (inf) mem 14543MB +[2023-10-11 08:59:37 simmim_pretrain](main_simmim.py 228): INFO EPOCH 62 training takes 0:28:24 +[2023-10-11 08:59:38 simmim_pretrain](main_simmim.py 218): INFO Train: [63/200][0/6787] eta 3:01:41 lr 0.000200 time 1.6062 (1.6062) loss 0.3561 (0.3561) grad_norm 489068.6562 (489068.6562) mem 14543MB 
+[2023-10-11 09:01:43 simmim_pretrain](main_simmim.py 218): INFO Train: [63/200][500/6787] eta 0:26:25 lr 0.000200 time 0.2545 (0.2522) loss 0.3433 (0.3618) grad_norm 523721.9062 (454235.5000) mem 14543MB +[2023-10-11 09:03:48 simmim_pretrain](main_simmim.py 218): INFO Train: [63/200][1000/6787] eta 0:24:11 lr 0.000200 time 0.2458 (0.2509) loss 0.3728 (0.3621) grad_norm 413777.7500 (447603.9688) mem 14543MB +[2023-10-11 09:05:52 simmim_pretrain](main_simmim.py 218): INFO Train: [63/200][1500/6787] eta 0:22:03 lr 0.000200 time 0.2459 (0.2504) loss 0.3750 (0.3621) grad_norm 574363.0625 (459671.6250) mem 14543MB +[2023-10-11 09:07:57 simmim_pretrain](main_simmim.py 218): INFO Train: [63/200][2000/6787] eta 0:19:57 lr 0.000200 time 0.2469 (0.2502) loss 0.3439 (0.3623) grad_norm 399589.4688 (inf) mem 14543MB +[2023-10-11 09:10:02 simmim_pretrain](main_simmim.py 218): INFO Train: [63/200][2500/6787] eta 0:17:52 lr 0.000200 time 0.2477 (0.2501) loss 0.3575 (0.3626) grad_norm 300643.6250 (inf) mem 14543MB +[2023-10-11 09:12:07 simmim_pretrain](main_simmim.py 218): INFO Train: [63/200][3000/6787] eta 0:15:46 lr 0.000200 time 0.2464 (0.2499) loss 0.3751 (0.3629) grad_norm 348417.5312 (inf) mem 14543MB +[2023-10-11 09:14:11 simmim_pretrain](main_simmim.py 218): INFO Train: [63/200][3500/6787] eta 0:13:41 lr 0.000200 time 0.2565 (0.2499) loss 0.3590 (0.3630) grad_norm 212575.5156 (inf) mem 14543MB +[2023-10-11 09:16:16 simmim_pretrain](main_simmim.py 218): INFO Train: [63/200][4000/6787] eta 0:11:36 lr 0.000200 time 0.2499 (0.2498) loss 0.3504 (0.3629) grad_norm 675239.0000 (inf) mem 14543MB +[2023-10-11 09:18:21 simmim_pretrain](main_simmim.py 218): INFO Train: [63/200][4500/6787] eta 0:09:31 lr 0.000200 time 0.2474 (0.2498) loss 0.3493 (0.3628) grad_norm 490092.1875 (inf) mem 14543MB +[2023-10-11 09:20:25 simmim_pretrain](main_simmim.py 218): INFO Train: [63/200][5000/6787] eta 0:07:26 lr 0.000200 time 0.2461 (0.2497) loss 0.3670 (0.3628) grad_norm 478417.6875 (inf) mem 
14543MB +[2023-10-11 09:22:30 simmim_pretrain](main_simmim.py 218): INFO Train: [63/200][5500/6787] eta 0:05:21 lr 0.000200 time 0.2478 (0.2497) loss 0.3536 (0.3628) grad_norm 457909.7500 (inf) mem 14543MB +[2023-10-11 09:24:34 simmim_pretrain](main_simmim.py 218): INFO Train: [63/200][6000/6787] eta 0:03:16 lr 0.000200 time 0.2482 (0.2496) loss 0.3527 (0.3628) grad_norm 247812.6875 (inf) mem 14543MB +[2023-10-11 09:26:39 simmim_pretrain](main_simmim.py 218): INFO Train: [63/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2483 (0.2496) loss 0.3410 (0.3628) grad_norm 300270.0938 (inf) mem 14543MB +[2023-10-11 09:27:51 simmim_pretrain](main_simmim.py 228): INFO EPOCH 63 training takes 0:28:14 +[2023-10-11 09:27:53 simmim_pretrain](main_simmim.py 218): INFO Train: [64/200][0/6787] eta 3:07:11 lr 0.000200 time 1.6549 (1.6549) loss 0.3408 (0.3408) grad_norm 326039.5938 (326039.5938) mem 14543MB +[2023-10-11 09:29:58 simmim_pretrain](main_simmim.py 218): INFO Train: [64/200][500/6787] eta 0:26:26 lr 0.000200 time 0.2490 (0.2523) loss 0.3527 (0.3650) grad_norm 271117.4062 (266561.8438) mem 14543MB +[2023-10-11 09:32:03 simmim_pretrain](main_simmim.py 218): INFO Train: [64/200][1000/6787] eta 0:24:14 lr 0.000200 time 0.2486 (0.2513) loss 0.3396 (0.3642) grad_norm 259139.3594 (259407.6562) mem 14543MB +[2023-10-11 09:34:08 simmim_pretrain](main_simmim.py 218): INFO Train: [64/200][1500/6787] eta 0:22:06 lr 0.000200 time 0.2492 (0.2509) loss 0.3250 (0.3638) grad_norm 297275.6250 (inf) mem 14543MB +[2023-10-11 09:36:13 simmim_pretrain](main_simmim.py 218): INFO Train: [64/200][2000/6787] eta 0:20:00 lr 0.000200 time 0.2499 (0.2508) loss 0.3545 (0.3637) grad_norm 264638.2188 (inf) mem 14543MB +[2023-10-11 09:38:18 simmim_pretrain](main_simmim.py 218): INFO Train: [64/200][2500/6787] eta 0:17:54 lr 0.000200 time 0.2481 (0.2507) loss 0.3680 (0.3638) grad_norm 205538.4844 (inf) mem 14543MB +[2023-10-11 09:40:23 simmim_pretrain](main_simmim.py 218): INFO Train: [64/200][3000/6787] 
eta 0:15:49 lr 0.000200 time 0.2599 (0.2506) loss 0.3544 (0.3636) grad_norm 115810.5391 (inf) mem 14543MB +[2023-10-11 09:42:28 simmim_pretrain](main_simmim.py 218): INFO Train: [64/200][3500/6787] eta 0:13:43 lr 0.000200 time 0.2487 (0.2506) loss 0.3643 (0.3634) grad_norm 496984.2812 (inf) mem 14543MB +[2023-10-11 09:44:34 simmim_pretrain](main_simmim.py 218): INFO Train: [64/200][4000/6787] eta 0:11:38 lr 0.000200 time 0.2483 (0.2506) loss 0.3677 (0.3632) grad_norm 443846.6250 (inf) mem 14543MB +[2023-10-11 09:46:39 simmim_pretrain](main_simmim.py 218): INFO Train: [64/200][4500/6787] eta 0:09:33 lr 0.000200 time 0.2552 (0.2506) loss 0.3740 (0.3630) grad_norm 490680.4688 (inf) mem 14543MB +[2023-10-11 09:48:44 simmim_pretrain](main_simmim.py 218): INFO Train: [64/200][5000/6787] eta 0:07:27 lr 0.000200 time 0.2500 (0.2506) loss 0.3661 (0.3628) grad_norm 503764.2188 (inf) mem 14543MB +[2023-10-11 09:50:49 simmim_pretrain](main_simmim.py 218): INFO Train: [64/200][5500/6787] eta 0:05:22 lr 0.000200 time 0.2489 (0.2505) loss 0.3502 (0.3628) grad_norm 286947.0938 (inf) mem 14543MB +[2023-10-11 09:52:54 simmim_pretrain](main_simmim.py 218): INFO Train: [64/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2496 (0.2505) loss 0.3489 (0.3629) grad_norm 222102.7344 (inf) mem 14543MB +[2023-10-11 09:55:00 simmim_pretrain](main_simmim.py 218): INFO Train: [64/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2486 (0.2505) loss 0.3438 (0.3630) grad_norm 204317.4062 (inf) mem 14543MB +[2023-10-11 09:56:12 simmim_pretrain](main_simmim.py 228): INFO EPOCH 64 training takes 0:28:20 +[2023-10-11 09:56:13 simmim_pretrain](main_simmim.py 218): INFO Train: [65/200][0/6787] eta 2:49:34 lr 0.000200 time 1.4992 (1.4992) loss 0.3618 (0.3618) grad_norm 131299.1250 (131299.1250) mem 14543MB +[2023-10-11 09:58:18 simmim_pretrain](main_simmim.py 218): INFO Train: [65/200][500/6787] eta 0:26:23 lr 0.000200 time 0.2527 (0.2518) loss 0.3447 (0.3635) grad_norm 348200.5312 (inf) mem 14543MB 
+[2023-10-11 10:00:23 simmim_pretrain](main_simmim.py 218): INFO Train: [65/200][1000/6787] eta 0:24:10 lr 0.000200 time 0.2461 (0.2507) loss 0.3527 (0.3630) grad_norm 277952.8750 (inf) mem 14543MB +[2023-10-11 10:02:28 simmim_pretrain](main_simmim.py 218): INFO Train: [65/200][1500/6787] eta 0:22:03 lr 0.000200 time 0.2523 (0.2503) loss 0.3643 (0.3628) grad_norm 367702.1562 (inf) mem 14543MB +[2023-10-11 10:04:33 simmim_pretrain](main_simmim.py 218): INFO Train: [65/200][2000/6787] eta 0:19:57 lr 0.000200 time 0.2454 (0.2502) loss 0.3575 (0.3631) grad_norm 337869.9062 (inf) mem 14543MB +[2023-10-11 10:06:37 simmim_pretrain](main_simmim.py 218): INFO Train: [65/200][2500/6787] eta 0:17:52 lr 0.000200 time 0.2485 (0.2501) loss 0.3505 (0.3631) grad_norm 530708.5000 (inf) mem 14543MB +[2023-10-11 10:08:42 simmim_pretrain](main_simmim.py 218): INFO Train: [65/200][3000/6787] eta 0:15:46 lr 0.000200 time 0.2454 (0.2499) loss 0.3634 (0.3629) grad_norm 246915.5938 (inf) mem 14543MB +[2023-10-11 10:10:47 simmim_pretrain](main_simmim.py 218): INFO Train: [65/200][3500/6787] eta 0:13:41 lr 0.000200 time 0.2470 (0.2499) loss 0.3667 (0.3627) grad_norm 420803.4062 (inf) mem 14543MB +[2023-10-11 10:12:51 simmim_pretrain](main_simmim.py 218): INFO Train: [65/200][4000/6787] eta 0:11:36 lr 0.000200 time 0.2553 (0.2498) loss 0.3589 (0.3628) grad_norm 297858.1562 (inf) mem 14543MB +[2023-10-11 10:14:56 simmim_pretrain](main_simmim.py 218): INFO Train: [65/200][4500/6787] eta 0:09:31 lr 0.000200 time 0.2465 (0.2498) loss 0.3863 (0.3629) grad_norm 377202.7812 (inf) mem 14543MB +[2023-10-11 10:17:01 simmim_pretrain](main_simmim.py 218): INFO Train: [65/200][5000/6787] eta 0:07:26 lr 0.000200 time 0.2450 (0.2497) loss 0.3784 (0.3629) grad_norm 179266.7344 (inf) mem 14543MB +[2023-10-11 10:19:06 simmim_pretrain](main_simmim.py 218): INFO Train: [65/200][5500/6787] eta 0:05:21 lr 0.000200 time 0.2492 (0.2497) loss 0.3795 (0.3629) grad_norm 300468.5625 (inf) mem 14543MB +[2023-10-11 
10:21:10 simmim_pretrain](main_simmim.py 218): INFO Train: [65/200][6000/6787] eta 0:03:16 lr 0.000200 time 0.2543 (0.2497) loss 0.3467 (0.3632) grad_norm 156159.7812 (inf) mem 14543MB +[2023-10-11 10:23:15 simmim_pretrain](main_simmim.py 218): INFO Train: [65/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2463 (0.2496) loss 0.3842 (0.3633) grad_norm 104296.7266 (inf) mem 14543MB +[2023-10-11 10:24:27 simmim_pretrain](main_simmim.py 228): INFO EPOCH 65 training takes 0:28:15 +[2023-10-11 10:24:29 simmim_pretrain](main_simmim.py 218): INFO Train: [66/200][0/6787] eta 3:00:12 lr 0.000200 time 1.5931 (1.5931) loss 0.3635 (0.3635) grad_norm 122723.4219 (122723.4219) mem 14543MB +[2023-10-11 10:26:33 simmim_pretrain](main_simmim.py 218): INFO Train: [66/200][500/6787] eta 0:26:21 lr 0.000200 time 0.2484 (0.2515) loss 0.3533 (0.3657) grad_norm 147229.4219 (143896.7969) mem 14543MB +[2023-10-11 10:28:38 simmim_pretrain](main_simmim.py 218): INFO Train: [66/200][1000/6787] eta 0:24:09 lr 0.000200 time 0.2466 (0.2505) loss 0.3847 (0.3653) grad_norm 181782.2031 (144858.1562) mem 14543MB +[2023-10-11 10:30:42 simmim_pretrain](main_simmim.py 218): INFO Train: [66/200][1500/6787] eta 0:22:02 lr 0.000200 time 0.2496 (0.2501) loss 0.3636 (0.3648) grad_norm 150437.2656 (159238.7344) mem 14543MB +[2023-10-11 10:32:47 simmim_pretrain](main_simmim.py 218): INFO Train: [66/200][2000/6787] eta 0:19:56 lr 0.000200 time 0.2459 (0.2499) loss 0.3548 (0.3644) grad_norm 159439.0312 (173485.6719) mem 14543MB +[2023-10-11 10:34:52 simmim_pretrain](main_simmim.py 218): INFO Train: [66/200][2500/6787] eta 0:17:51 lr 0.000200 time 0.2466 (0.2498) loss 0.3418 (0.3641) grad_norm 197232.2656 (185502.4844) mem 14543MB +[2023-10-11 10:36:57 simmim_pretrain](main_simmim.py 218): INFO Train: [66/200][3000/6787] eta 0:15:45 lr 0.000200 time 0.2555 (0.2498) loss 0.3732 (0.3640) grad_norm 190845.1562 (195342.1094) mem 14543MB +[2023-10-11 10:39:01 simmim_pretrain](main_simmim.py 218): INFO Train: 
[66/200][3500/6787] eta 0:13:40 lr 0.000200 time 0.2495 (0.2497) loss 0.3524 (0.3637) grad_norm 325907.9688 (226250.8594) mem 14543MB +[2023-10-11 10:41:06 simmim_pretrain](main_simmim.py 218): INFO Train: [66/200][4000/6787] eta 0:11:35 lr 0.000200 time 0.2471 (0.2496) loss 0.3473 (0.3635) grad_norm 502769.1875 (248052.8438) mem 14543MB +[2023-10-11 10:43:10 simmim_pretrain](main_simmim.py 218): INFO Train: [66/200][4500/6787] eta 0:09:30 lr 0.000200 time 0.2482 (0.2495) loss 0.3539 (0.3632) grad_norm 220479.4688 (271358.7188) mem 14543MB +[2023-10-11 10:45:14 simmim_pretrain](main_simmim.py 218): INFO Train: [66/200][5000/6787] eta 0:07:25 lr 0.000200 time 0.2476 (0.2494) loss 0.3722 (0.3632) grad_norm 346645.1250 (inf) mem 14543MB +[2023-10-11 10:47:19 simmim_pretrain](main_simmim.py 218): INFO Train: [66/200][5500/6787] eta 0:05:20 lr 0.000200 time 0.2514 (0.2493) loss 0.3609 (0.3632) grad_norm 247438.4531 (inf) mem 14543MB +[2023-10-11 10:49:23 simmim_pretrain](main_simmim.py 218): INFO Train: [66/200][6000/6787] eta 0:03:16 lr 0.000200 time 0.2494 (0.2493) loss 0.3428 (0.3633) grad_norm 363803.6250 (inf) mem 14543MB +[2023-10-11 10:51:27 simmim_pretrain](main_simmim.py 218): INFO Train: [66/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2486 (0.2492) loss 0.3598 (0.3633) grad_norm 322327.1875 (inf) mem 14543MB +[2023-10-11 10:52:39 simmim_pretrain](main_simmim.py 228): INFO EPOCH 66 training takes 0:28:11 +[2023-10-11 10:52:40 simmim_pretrain](main_simmim.py 218): INFO Train: [67/200][0/6787] eta 2:44:39 lr 0.000200 time 1.4557 (1.4557) loss 0.3438 (0.3438) grad_norm 455356.1875 (455356.1875) mem 14543MB +[2023-10-11 10:54:44 simmim_pretrain](main_simmim.py 218): INFO Train: [67/200][500/6787] eta 0:26:13 lr 0.000200 time 0.2496 (0.2502) loss 0.3794 (0.3621) grad_norm 349354.0312 (348277.7188) mem 14543MB +[2023-10-11 10:56:48 simmim_pretrain](main_simmim.py 218): INFO Train: [67/200][1000/6787] eta 0:24:02 lr 0.000200 time 0.2464 (0.2492) loss 0.3691 
(0.3619) grad_norm 512163.8750 (414587.6875) mem 14543MB +[2023-10-11 10:58:52 simmim_pretrain](main_simmim.py 218): INFO Train: [67/200][1500/6787] eta 0:21:56 lr 0.000200 time 0.2564 (0.2490) loss 0.3868 (0.3621) grad_norm 383947.5938 (463544.0312) mem 14543MB +[2023-10-11 11:00:57 simmim_pretrain](main_simmim.py 218): INFO Train: [67/200][2000/6787] eta 0:19:51 lr 0.000200 time 0.2466 (0.2489) loss 0.3638 (0.3620) grad_norm 250414.6094 (inf) mem 14543MB +[2023-10-11 11:03:01 simmim_pretrain](main_simmim.py 218): INFO Train: [67/200][2500/6787] eta 0:17:46 lr 0.000200 time 0.2545 (0.2489) loss 0.3390 (0.3623) grad_norm 149120.5625 (inf) mem 14543MB +[2023-10-11 11:05:06 simmim_pretrain](main_simmim.py 218): INFO Train: [67/200][3000/6787] eta 0:15:42 lr 0.000200 time 0.2526 (0.2489) loss 0.3642 (0.3624) grad_norm 260623.2969 (inf) mem 14543MB +[2023-10-11 11:07:11 simmim_pretrain](main_simmim.py 218): INFO Train: [67/200][3500/6787] eta 0:13:38 lr 0.000200 time 0.2499 (0.2491) loss 0.3849 (0.3625) grad_norm 246417.0781 (inf) mem 14543MB +[2023-10-11 11:09:16 simmim_pretrain](main_simmim.py 218): INFO Train: [67/200][4000/6787] eta 0:11:34 lr 0.000200 time 0.2515 (0.2491) loss 0.3563 (0.3626) grad_norm 372839.0312 (inf) mem 14543MB +[2023-10-11 11:11:20 simmim_pretrain](main_simmim.py 218): INFO Train: [67/200][4500/6787] eta 0:09:29 lr 0.000200 time 0.2491 (0.2492) loss 0.3512 (0.3625) grad_norm 336771.3438 (inf) mem 14543MB +[2023-10-11 11:13:25 simmim_pretrain](main_simmim.py 218): INFO Train: [67/200][5000/6787] eta 0:07:25 lr 0.000200 time 0.2505 (0.2493) loss 0.3544 (0.3624) grad_norm 608553.5625 (inf) mem 14543MB +[2023-10-11 11:15:30 simmim_pretrain](main_simmim.py 218): INFO Train: [67/200][5500/6787] eta 0:05:20 lr 0.000200 time 0.2592 (0.2493) loss 0.3731 (0.3624) grad_norm 258384.6094 (inf) mem 14543MB +[2023-10-11 11:17:35 simmim_pretrain](main_simmim.py 218): INFO Train: [67/200][6000/6787] eta 0:03:16 lr 0.000200 time 0.2536 (0.2493) loss 0.3622 
(0.3624) grad_norm 183729.2500 (inf) mem 14543MB +[2023-10-11 11:19:40 simmim_pretrain](main_simmim.py 218): INFO Train: [67/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2547 (0.2493) loss 0.3247 (0.3625) grad_norm 233729.1250 (inf) mem 14543MB +[2023-10-11 11:20:52 simmim_pretrain](main_simmim.py 228): INFO EPOCH 67 training takes 0:28:13 +[2023-10-11 11:20:53 simmim_pretrain](main_simmim.py 218): INFO Train: [68/200][0/6787] eta 2:44:58 lr 0.000200 time 1.4584 (1.4584) loss 0.3734 (0.3734) grad_norm 218391.2656 (218391.2656) mem 14543MB +[2023-10-11 11:22:58 simmim_pretrain](main_simmim.py 218): INFO Train: [68/200][500/6787] eta 0:26:23 lr 0.000200 time 0.2456 (0.2519) loss 0.3428 (0.3631) grad_norm 305564.8125 (303168.0938) mem 14543MB +[2023-10-11 11:25:03 simmim_pretrain](main_simmim.py 218): INFO Train: [68/200][1000/6787] eta 0:24:11 lr 0.000200 time 0.2591 (0.2508) loss 0.3522 (0.3626) grad_norm 286851.2188 (331284.9062) mem 14543MB +[2023-10-11 11:27:08 simmim_pretrain](main_simmim.py 218): INFO Train: [68/200][1500/6787] eta 0:22:04 lr 0.000200 time 0.2527 (0.2505) loss 0.3554 (0.3625) grad_norm 615308.8125 (358500.2812) mem 14543MB +[2023-10-11 11:29:13 simmim_pretrain](main_simmim.py 218): INFO Train: [68/200][2000/6787] eta 0:19:58 lr 0.000200 time 0.2484 (0.2503) loss 0.3669 (0.3625) grad_norm 664551.8750 (379617.9688) mem 14543MB +[2023-10-11 11:31:18 simmim_pretrain](main_simmim.py 218): INFO Train: [68/200][2500/6787] eta 0:17:52 lr 0.000200 time 0.2596 (0.2501) loss 0.3807 (0.3625) grad_norm 158800.7656 (inf) mem 14543MB +[2023-10-11 11:33:23 simmim_pretrain](main_simmim.py 218): INFO Train: [68/200][3000/6787] eta 0:15:47 lr 0.000200 time 0.2462 (0.2501) loss 0.3644 (0.3627) grad_norm 270241.2500 (inf) mem 14543MB +[2023-10-11 11:35:27 simmim_pretrain](main_simmim.py 218): INFO Train: [68/200][3500/6787] eta 0:13:41 lr 0.000200 time 0.2468 (0.2500) loss 0.3699 (0.3629) grad_norm 259157.4375 (inf) mem 14543MB +[2023-10-11 11:37:32 
simmim_pretrain](main_simmim.py 218): INFO Train: [68/200][4000/6787] eta 0:11:36 lr 0.000200 time 0.2455 (0.2500) loss 0.3699 (0.3630) grad_norm 234691.3281 (inf) mem 14543MB +[2023-10-11 11:39:37 simmim_pretrain](main_simmim.py 218): INFO Train: [68/200][4500/6787] eta 0:09:31 lr 0.000200 time 0.2459 (0.2500) loss 0.3725 (0.3630) grad_norm 355270.2812 (inf) mem 14543MB +[2023-10-11 11:41:42 simmim_pretrain](main_simmim.py 218): INFO Train: [68/200][5000/6787] eta 0:07:26 lr 0.000200 time 0.2464 (0.2500) loss 0.3569 (0.3630) grad_norm 305011.2188 (inf) mem 14543MB +[2023-10-11 11:43:47 simmim_pretrain](main_simmim.py 218): INFO Train: [68/200][5500/6787] eta 0:05:21 lr 0.000200 time 0.2519 (0.2499) loss 0.3695 (0.3630) grad_norm 280601.3750 (inf) mem 14543MB +[2023-10-11 11:45:52 simmim_pretrain](main_simmim.py 218): INFO Train: [68/200][6000/6787] eta 0:03:16 lr 0.000200 time 0.2517 (0.2499) loss 0.3858 (0.3630) grad_norm 334819.2500 (inf) mem 14543MB +[2023-10-11 11:47:56 simmim_pretrain](main_simmim.py 218): INFO Train: [68/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2488 (0.2499) loss 0.3354 (0.3630) grad_norm 458634.2812 (inf) mem 14543MB +[2023-10-11 11:49:09 simmim_pretrain](main_simmim.py 228): INFO EPOCH 68 training takes 0:28:16 +[2023-10-11 11:49:10 simmim_pretrain](main_simmim.py 218): INFO Train: [69/200][0/6787] eta 2:46:54 lr 0.000200 time 1.4756 (1.4756) loss 0.3376 (0.3376) grad_norm 326984.8750 (326984.8750) mem 14543MB +[2023-10-11 11:51:15 simmim_pretrain](main_simmim.py 218): INFO Train: [69/200][500/6787] eta 0:26:23 lr 0.000200 time 0.2489 (0.2519) loss 0.3692 (0.3630) grad_norm 339139.3750 (inf) mem 14543MB +[2023-10-11 11:53:20 simmim_pretrain](main_simmim.py 218): INFO Train: [69/200][1000/6787] eta 0:24:12 lr 0.000200 time 0.2495 (0.2509) loss 0.3601 (0.3627) grad_norm 287679.7188 (inf) mem 14543MB +[2023-10-11 11:55:25 simmim_pretrain](main_simmim.py 218): INFO Train: [69/200][1500/6787] eta 0:22:04 lr 0.000200 time 0.2519 (0.2506) 
loss 0.3744 (0.3630) grad_norm 275359.9375 (inf) mem 14543MB +[2023-10-11 11:57:30 simmim_pretrain](main_simmim.py 218): INFO Train: [69/200][2000/6787] eta 0:19:58 lr 0.000200 time 0.2461 (0.2504) loss 0.3774 (0.3628) grad_norm 267005.1250 (inf) mem 14543MB +[2023-10-11 11:59:34 simmim_pretrain](main_simmim.py 218): INFO Train: [69/200][2500/6787] eta 0:17:52 lr 0.000200 time 0.2532 (0.2502) loss 0.3667 (0.3628) grad_norm 445587.9062 (inf) mem 14543MB +[2023-10-11 12:01:39 simmim_pretrain](main_simmim.py 218): INFO Train: [69/200][3000/6787] eta 0:15:47 lr 0.000200 time 0.2493 (0.2502) loss 0.3638 (0.3627) grad_norm 191623.6562 (inf) mem 14543MB +[2023-10-11 12:03:44 simmim_pretrain](main_simmim.py 218): INFO Train: [69/200][3500/6787] eta 0:13:42 lr 0.000200 time 0.2500 (0.2502) loss 0.3533 (0.3627) grad_norm 283179.1875 (inf) mem 14543MB +[2023-10-11 12:05:49 simmim_pretrain](main_simmim.py 218): INFO Train: [69/200][4000/6787] eta 0:11:37 lr 0.000200 time 0.2476 (0.2501) loss 0.3727 (0.3627) grad_norm 287005.5312 (inf) mem 14543MB +[2023-10-11 12:07:54 simmim_pretrain](main_simmim.py 218): INFO Train: [69/200][4500/6787] eta 0:09:32 lr 0.000200 time 0.2481 (0.2501) loss 0.3441 (0.3629) grad_norm 260419.6875 (inf) mem 14543MB +[2023-10-11 12:09:59 simmim_pretrain](main_simmim.py 218): INFO Train: [69/200][5000/6787] eta 0:07:26 lr 0.000200 time 0.2496 (0.2501) loss 0.3788 (0.3628) grad_norm 252886.3125 (inf) mem 14543MB +[2023-10-11 12:12:04 simmim_pretrain](main_simmim.py 218): INFO Train: [69/200][5500/6787] eta 0:05:21 lr 0.000200 time 0.2516 (0.2501) loss 0.3720 (0.3628) grad_norm 431046.2188 (inf) mem 14543MB +[2023-10-11 12:14:09 simmim_pretrain](main_simmim.py 218): INFO Train: [69/200][6000/6787] eta 0:03:16 lr 0.000200 time 0.2523 (0.2501) loss 0.3665 (0.3627) grad_norm 295240.2188 (inf) mem 14543MB +[2023-10-11 12:16:14 simmim_pretrain](main_simmim.py 218): INFO Train: [69/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2463 (0.2501) loss 0.3838 
(0.3625) grad_norm 358034.8750 (inf) mem 14543MB +[2023-10-11 12:17:27 simmim_pretrain](main_simmim.py 228): INFO EPOCH 69 training takes 0:28:18 +[2023-10-11 12:17:28 simmim_pretrain](main_simmim.py 218): INFO Train: [70/200][0/6787] eta 2:55:12 lr 0.000200 time 1.5490 (1.5490) loss 0.3655 (0.3655) grad_norm 379997.3750 (379997.3750) mem 14543MB +[2023-10-11 12:19:33 simmim_pretrain](main_simmim.py 218): INFO Train: [70/200][500/6787] eta 0:26:25 lr 0.000200 time 0.2501 (0.2522) loss 0.3458 (0.3645) grad_norm 241836.0000 (290924.5625) mem 14543MB +[2023-10-11 12:21:38 simmim_pretrain](main_simmim.py 218): INFO Train: [70/200][1000/6787] eta 0:24:12 lr 0.000200 time 0.2478 (0.2510) loss 0.3679 (0.3634) grad_norm 198858.2188 (292619.5312) mem 14543MB +[2023-10-11 12:23:43 simmim_pretrain](main_simmim.py 218): INFO Train: [70/200][1500/6787] eta 0:22:05 lr 0.000200 time 0.2523 (0.2506) loss 0.3650 (0.3639) grad_norm 248836.2031 (293684.5938) mem 14543MB +[2023-10-11 12:25:48 simmim_pretrain](main_simmim.py 218): INFO Train: [70/200][2000/6787] eta 0:19:58 lr 0.000200 time 0.2516 (0.2504) loss 0.3665 (0.3634) grad_norm 403359.8438 (346234.5938) mem 14543MB +[2023-10-11 12:27:53 simmim_pretrain](main_simmim.py 218): INFO Train: [70/200][2500/6787] eta 0:17:53 lr 0.000200 time 0.2492 (0.2503) loss 0.3832 (0.3632) grad_norm 183216.5312 (362120.5000) mem 14543MB +[2023-10-11 12:29:58 simmim_pretrain](main_simmim.py 218): INFO Train: [70/200][3000/6787] eta 0:15:47 lr 0.000200 time 0.2467 (0.2503) loss 0.3655 (0.3630) grad_norm 340075.1250 (inf) mem 14543MB +[2023-10-11 12:32:03 simmim_pretrain](main_simmim.py 218): INFO Train: [70/200][3500/6787] eta 0:13:42 lr 0.000200 time 0.2526 (0.2502) loss 0.3470 (0.3632) grad_norm 477674.0625 (inf) mem 14543MB +[2023-10-11 12:34:08 simmim_pretrain](main_simmim.py 218): INFO Train: [70/200][4000/6787] eta 0:11:37 lr 0.000200 time 0.2484 (0.2502) loss 0.3536 (0.3631) grad_norm 222605.5938 (inf) mem 14543MB +[2023-10-11 12:36:13 
simmim_pretrain](main_simmim.py 218): INFO Train: [70/200][4500/6787] eta 0:09:32 lr 0.000200 time 0.2517 (0.2502) loss 0.3599 (0.3630) grad_norm 337199.7812 (inf) mem 14543MB +[2023-10-11 12:38:18 simmim_pretrain](main_simmim.py 218): INFO Train: [70/200][5000/6787] eta 0:07:27 lr 0.000200 time 0.2482 (0.2502) loss 0.3541 (0.3630) grad_norm 366515.6250 (inf) mem 14543MB +[2023-10-11 12:40:23 simmim_pretrain](main_simmim.py 218): INFO Train: [70/200][5500/6787] eta 0:05:21 lr 0.000200 time 0.2463 (0.2501) loss 0.3852 (0.3630) grad_norm 438358.9062 (inf) mem 14543MB +[2023-10-11 12:42:28 simmim_pretrain](main_simmim.py 218): INFO Train: [70/200][6000/6787] eta 0:03:16 lr 0.000200 time 0.2500 (0.2501) loss 0.3463 (0.3629) grad_norm 425843.0938 (inf) mem 14543MB +[2023-10-11 12:44:33 simmim_pretrain](main_simmim.py 218): INFO Train: [70/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2494 (0.2501) loss 0.5558 (0.3640) grad_norm 16006.6729 (inf) mem 14543MB +[2023-10-11 12:45:45 simmim_pretrain](main_simmim.py 228): INFO EPOCH 70 training takes 0:28:17 +[2023-10-11 12:45:46 simmim_pretrain](main_simmim.py 218): INFO Train: [71/200][0/6787] eta 3:00:56 lr 0.000200 time 1.5996 (1.5996) loss 0.4845 (0.4845) grad_norm 17869.7910 (17869.7910) mem 14543MB +[2023-10-11 12:47:51 simmim_pretrain](main_simmim.py 218): INFO Train: [71/200][500/6787] eta 0:26:24 lr 0.000200 time 0.2475 (0.2520) loss 0.4192 (0.4687) grad_norm 71428.1094 (47209.5977) mem 14543MB +[2023-10-11 12:49:56 simmim_pretrain](main_simmim.py 218): INFO Train: [71/200][1000/6787] eta 0:24:11 lr 0.000200 time 0.2485 (0.2508) loss 0.3748 (0.4274) grad_norm 35970.5273 (44199.7617) mem 14543MB +[2023-10-11 12:52:01 simmim_pretrain](main_simmim.py 218): INFO Train: [71/200][1500/6787] eta 0:22:04 lr 0.000200 time 0.2548 (0.2505) loss 0.3567 (0.4093) grad_norm 24828.1074 (40550.5078) mem 14543MB +[2023-10-11 12:54:05 simmim_pretrain](main_simmim.py 218): INFO Train: [71/200][2000/6787] eta 0:19:58 lr 0.000200 time 
0.2469 (0.2503) loss 0.3760 (0.3992) grad_norm 43446.0703 (40208.3281) mem 14543MB +[2023-10-11 12:56:10 simmim_pretrain](main_simmim.py 218): INFO Train: [71/200][2500/6787] eta 0:17:52 lr 0.000200 time 0.2465 (0.2502) loss 0.3934 (0.3927) grad_norm 68785.9688 (42080.1523) mem 14543MB +[2023-10-11 12:58:15 simmim_pretrain](main_simmim.py 218): INFO Train: [71/200][3000/6787] eta 0:15:47 lr 0.000200 time 0.2448 (0.2501) loss 0.3601 (0.3883) grad_norm 51108.8906 (44841.0234) mem 14543MB +[2023-10-11 13:00:20 simmim_pretrain](main_simmim.py 218): INFO Train: [71/200][3500/6787] eta 0:13:41 lr 0.000200 time 0.2484 (0.2500) loss 0.3635 (0.3851) grad_norm 54616.0352 (46022.2930) mem 14543MB +[2023-10-11 13:02:25 simmim_pretrain](main_simmim.py 218): INFO Train: [71/200][4000/6787] eta 0:11:36 lr 0.000200 time 0.2482 (0.2500) loss 0.3467 (0.3828) grad_norm 110293.1016 (48538.6172) mem 14543MB +[2023-10-11 13:04:30 simmim_pretrain](main_simmim.py 218): INFO Train: [71/200][4500/6787] eta 0:09:31 lr 0.000200 time 0.2532 (0.2500) loss 0.3648 (0.3807) grad_norm 121720.9609 (52467.9844) mem 14543MB +[2023-10-11 13:06:34 simmim_pretrain](main_simmim.py 218): INFO Train: [71/200][5000/6787] eta 0:07:26 lr 0.000200 time 0.2463 (0.2499) loss 0.3804 (0.3790) grad_norm 52323.6992 (56691.4531) mem 14543MB +[2023-10-11 13:08:39 simmim_pretrain](main_simmim.py 218): INFO Train: [71/200][5500/6787] eta 0:05:21 lr 0.000200 time 0.2500 (0.2498) loss 0.3665 (0.3776) grad_norm 74710.8516 (60506.1133) mem 14543MB +[2023-10-11 13:10:43 simmim_pretrain](main_simmim.py 218): INFO Train: [71/200][6000/6787] eta 0:03:16 lr 0.000200 time 0.2496 (0.2498) loss 0.3732 (0.3764) grad_norm 128692.6875 (66529.7109) mem 14543MB +[2023-10-11 13:12:48 simmim_pretrain](main_simmim.py 218): INFO Train: [71/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2502 (0.2498) loss 0.3521 (0.3753) grad_norm 196150.9531 (72503.0625) mem 14543MB +[2023-10-11 13:14:00 simmim_pretrain](main_simmim.py 228): INFO EPOCH 71 
training takes 0:28:15 +[2023-10-11 13:14:02 simmim_pretrain](main_simmim.py 218): INFO Train: [72/200][0/6787] eta 2:51:26 lr 0.000200 time 1.5156 (1.5156) loss 0.3558 (0.3558) grad_norm 125773.7266 (125773.7266) mem 14543MB +[2023-10-11 13:16:07 simmim_pretrain](main_simmim.py 218): INFO Train: [72/200][500/6787] eta 0:26:25 lr 0.000200 time 0.2576 (0.2522) loss 0.3832 (0.3619) grad_norm 137979.9688 (182666.3594) mem 14543MB +[2023-10-11 13:18:12 simmim_pretrain](main_simmim.py 218): INFO Train: [72/200][1000/6787] eta 0:24:11 lr 0.000200 time 0.2500 (0.2509) loss 0.3589 (0.3622) grad_norm 172224.1250 (199330.5469) mem 14543MB +[2023-10-11 13:20:17 simmim_pretrain](main_simmim.py 218): INFO Train: [72/200][1500/6787] eta 0:22:04 lr 0.000200 time 0.2550 (0.2506) loss 0.3438 (0.3615) grad_norm 447376.9062 (245628.9531) mem 14543MB +[2023-10-11 13:22:22 simmim_pretrain](main_simmim.py 218): INFO Train: [72/200][2000/6787] eta 0:19:58 lr 0.000200 time 0.2496 (0.2504) loss 0.3485 (0.3612) grad_norm 164869.8125 (273051.0938) mem 14543MB +[2023-10-11 13:24:27 simmim_pretrain](main_simmim.py 218): INFO Train: [72/200][2500/6787] eta 0:17:53 lr 0.000200 time 0.2474 (0.2503) loss 0.3672 (0.3611) grad_norm 458103.4062 (312458.3750) mem 14543MB +[2023-10-11 13:26:32 simmim_pretrain](main_simmim.py 218): INFO Train: [72/200][3000/6787] eta 0:15:47 lr 0.000200 time 0.2455 (0.2503) loss 0.3961 (0.3611) grad_norm 334672.4688 (321120.7500) mem 14543MB +[2023-10-11 13:28:37 simmim_pretrain](main_simmim.py 218): INFO Train: [72/200][3500/6787] eta 0:13:42 lr 0.000200 time 0.2546 (0.2502) loss 0.3627 (0.3610) grad_norm 455840.0312 (342745.6250) mem 14543MB +[2023-10-11 13:30:42 simmim_pretrain](main_simmim.py 218): INFO Train: [72/200][4000/6787] eta 0:11:37 lr 0.000200 time 0.2516 (0.2502) loss 0.3711 (0.3611) grad_norm 275151.7500 (inf) mem 14543MB +[2023-10-11 13:32:47 simmim_pretrain](main_simmim.py 218): INFO Train: [72/200][4500/6787] eta 0:09:32 lr 0.000200 time 0.2538 
(0.2502) loss 0.3518 (0.3612) grad_norm 230955.5625 (inf) mem 14543MB +[2023-10-11 13:34:52 simmim_pretrain](main_simmim.py 218): INFO Train: [72/200][5000/6787] eta 0:07:27 lr 0.000200 time 0.2515 (0.2502) loss 0.3756 (0.3614) grad_norm 113382.4375 (inf) mem 14543MB +[2023-10-11 13:36:57 simmim_pretrain](main_simmim.py 218): INFO Train: [72/200][5500/6787] eta 0:05:22 lr 0.000200 time 0.2464 (0.2502) loss 0.3721 (0.3617) grad_norm 202803.6406 (inf) mem 14543MB +[2023-10-11 13:39:02 simmim_pretrain](main_simmim.py 218): INFO Train: [72/200][6000/6787] eta 0:03:16 lr 0.000200 time 0.2514 (0.2502) loss 0.3502 (0.3617) grad_norm 164396.1094 (inf) mem 14543MB +[2023-10-11 13:41:07 simmim_pretrain](main_simmim.py 218): INFO Train: [72/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2455 (0.2502) loss 0.3586 (0.3617) grad_norm 372818.0000 (inf) mem 14543MB +[2023-10-11 13:42:19 simmim_pretrain](main_simmim.py 228): INFO EPOCH 72 training takes 0:28:18 +[2023-10-11 13:42:20 simmim_pretrain](main_simmim.py 218): INFO Train: [73/200][0/6787] eta 2:51:43 lr 0.000200 time 1.5182 (1.5182) loss 0.3679 (0.3679) grad_norm 403169.1250 (403169.1250) mem 14543MB +[2023-10-11 13:44:25 simmim_pretrain](main_simmim.py 218): INFO Train: [73/200][500/6787] eta 0:26:19 lr 0.000200 time 0.2459 (0.2512) loss 0.3514 (0.3614) grad_norm 358541.9688 (354899.9375) mem 14543MB +[2023-10-11 13:46:29 simmim_pretrain](main_simmim.py 218): INFO Train: [73/200][1000/6787] eta 0:24:08 lr 0.000200 time 0.2499 (0.2502) loss 0.3513 (0.3611) grad_norm 309928.3125 (368312.7500) mem 14543MB +[2023-10-11 13:48:34 simmim_pretrain](main_simmim.py 218): INFO Train: [73/200][1500/6787] eta 0:22:01 lr 0.000200 time 0.2463 (0.2499) loss 0.3622 (0.3611) grad_norm 474251.8750 (inf) mem 14543MB +[2023-10-11 13:50:39 simmim_pretrain](main_simmim.py 218): INFO Train: [73/200][2000/6787] eta 0:19:56 lr 0.000200 time 0.2446 (0.2498) loss 0.3790 (0.3612) grad_norm 607006.0625 (inf) mem 14543MB +[2023-10-11 13:52:44 
simmim_pretrain](main_simmim.py 218): INFO Train: [73/200][2500/6787] eta 0:17:51 lr 0.000200 time 0.2465 (0.2498) loss 0.3406 (0.3610) grad_norm 487118.5625 (inf) mem 14543MB +[2023-10-11 13:54:49 simmim_pretrain](main_simmim.py 218): INFO Train: [73/200][3000/6787] eta 0:15:46 lr 0.000200 time 0.2529 (0.2498) loss 0.3560 (0.3611) grad_norm 320547.8125 (inf) mem 14543MB +[2023-10-11 13:56:54 simmim_pretrain](main_simmim.py 218): INFO Train: [73/200][3500/6787] eta 0:13:41 lr 0.000200 time 0.2462 (0.2499) loss 0.3710 (0.3611) grad_norm 330880.5625 (inf) mem 14543MB +[2023-10-11 13:58:59 simmim_pretrain](main_simmim.py 218): INFO Train: [73/200][4000/6787] eta 0:11:36 lr 0.000200 time 0.2487 (0.2499) loss 0.3595 (0.3612) grad_norm 297180.5000 (inf) mem 14543MB +[2023-10-11 14:01:03 simmim_pretrain](main_simmim.py 218): INFO Train: [73/200][4500/6787] eta 0:09:31 lr 0.000200 time 0.2498 (0.2498) loss 0.3578 (0.3614) grad_norm 346602.9062 (inf) mem 14543MB +[2023-10-11 14:03:08 simmim_pretrain](main_simmim.py 218): INFO Train: [73/200][5000/6787] eta 0:07:26 lr 0.000200 time 0.2470 (0.2498) loss 0.3749 (0.3615) grad_norm 308222.3438 (inf) mem 14543MB +[2023-10-11 14:05:13 simmim_pretrain](main_simmim.py 218): INFO Train: [73/200][5500/6787] eta 0:05:21 lr 0.000200 time 0.2469 (0.2498) loss 0.3708 (0.3614) grad_norm 141277.2344 (inf) mem 14543MB +[2023-10-11 14:07:18 simmim_pretrain](main_simmim.py 218): INFO Train: [73/200][6000/6787] eta 0:03:16 lr 0.000200 time 0.2520 (0.2498) loss 0.3703 (0.3614) grad_norm 304034.0000 (inf) mem 14543MB +[2023-10-11 14:09:23 simmim_pretrain](main_simmim.py 218): INFO Train: [73/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2463 (0.2498) loss 0.3370 (0.3617) grad_norm 178278.9375 (inf) mem 14543MB +[2023-10-11 14:10:35 simmim_pretrain](main_simmim.py 228): INFO EPOCH 73 training takes 0:28:16 +[2023-10-11 14:10:37 simmim_pretrain](main_simmim.py 218): INFO Train: [74/200][0/6787] eta 2:51:53 lr 0.000200 time 1.5196 (1.5196) loss 
0.3592 (0.3592) grad_norm 235316.8281 (235316.8281) mem 14543MB +[2023-10-11 14:12:41 simmim_pretrain](main_simmim.py 218): INFO Train: [74/200][500/6787] eta 0:26:23 lr 0.000200 time 0.2501 (0.2519) loss 0.3644 (0.3642) grad_norm 147381.8281 (135425.0625) mem 14543MB +[2023-10-11 14:14:46 simmim_pretrain](main_simmim.py 218): INFO Train: [74/200][1000/6787] eta 0:24:11 lr 0.000200 time 0.2467 (0.2509) loss 0.3613 (0.3641) grad_norm 91527.8516 (130933.0547) mem 14543MB +[2023-10-11 14:16:51 simmim_pretrain](main_simmim.py 218): INFO Train: [74/200][1500/6787] eta 0:22:04 lr 0.000200 time 0.2453 (0.2505) loss 0.3784 (0.3643) grad_norm 211484.9688 (130610.3516) mem 14543MB +[2023-10-11 14:18:56 simmim_pretrain](main_simmim.py 218): INFO Train: [74/200][2000/6787] eta 0:19:58 lr 0.000200 time 0.2489 (0.2504) loss 0.3523 (0.3640) grad_norm 190343.1406 (143885.9219) mem 14543MB +[2023-10-11 14:21:01 simmim_pretrain](main_simmim.py 218): INFO Train: [74/200][2500/6787] eta 0:17:52 lr 0.000200 time 0.2468 (0.2503) loss 0.3489 (0.3636) grad_norm 287275.7188 (160661.9062) mem 14543MB +[2023-10-11 14:23:06 simmim_pretrain](main_simmim.py 218): INFO Train: [74/200][3000/6787] eta 0:15:47 lr 0.000200 time 0.2482 (0.2503) loss 0.3685 (0.3633) grad_norm 210228.7812 (166119.8281) mem 14543MB +[2023-10-11 14:25:11 simmim_pretrain](main_simmim.py 218): INFO Train: [74/200][3500/6787] eta 0:13:42 lr 0.000200 time 0.2468 (0.2502) loss 0.3684 (0.3634) grad_norm 177964.5312 (171908.0000) mem 14543MB +[2023-10-11 14:27:16 simmim_pretrain](main_simmim.py 218): INFO Train: [74/200][4000/6787] eta 0:11:37 lr 0.000200 time 0.2500 (0.2502) loss 0.3702 (0.3631) grad_norm 448338.3125 (184502.0312) mem 14543MB +[2023-10-11 14:29:21 simmim_pretrain](main_simmim.py 218): INFO Train: [74/200][4500/6787] eta 0:09:32 lr 0.000200 time 0.2509 (0.2501) loss 0.3485 (0.3628) grad_norm 389904.9375 (201466.6719) mem 14543MB +[2023-10-11 14:31:26 simmim_pretrain](main_simmim.py 218): INFO Train: 
[74/200][5000/6787] eta 0:07:26 lr 0.000200 time 0.2487 (0.2501) loss 0.3738 (0.3627) grad_norm 589308.5000 (217718.3750) mem 14543MB +[2023-10-11 14:33:31 simmim_pretrain](main_simmim.py 218): INFO Train: [74/200][5500/6787] eta 0:05:21 lr 0.000200 time 0.2469 (0.2501) loss 0.3685 (0.3627) grad_norm 362302.9688 (inf) mem 14543MB +[2023-10-11 14:35:36 simmim_pretrain](main_simmim.py 218): INFO Train: [74/200][6000/6787] eta 0:03:16 lr 0.000200 time 0.2499 (0.2501) loss 0.3837 (0.3625) grad_norm 462494.1250 (inf) mem 14543MB +[2023-10-11 14:37:41 simmim_pretrain](main_simmim.py 218): INFO Train: [74/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2487 (0.2501) loss 0.3586 (0.3623) grad_norm 214753.9531 (inf) mem 14543MB +[2023-10-11 14:38:53 simmim_pretrain](main_simmim.py 228): INFO EPOCH 74 training takes 0:28:18 +[2023-10-11 14:38:55 simmim_pretrain](main_simmim.py 218): INFO Train: [75/200][0/6787] eta 2:49:56 lr 0.000200 time 1.5023 (1.5023) loss 0.3743 (0.3743) grad_norm 473136.8438 (473136.8438) mem 14543MB +[2023-10-11 14:41:00 simmim_pretrain](main_simmim.py 218): INFO Train: [75/200][500/6787] eta 0:26:24 lr 0.000200 time 0.2530 (0.2519) loss 0.3643 (0.3612) grad_norm 401562.3438 (498453.3750) mem 14543MB +[2023-10-11 14:43:05 simmim_pretrain](main_simmim.py 218): INFO Train: [75/200][1000/6787] eta 0:24:12 lr 0.000200 time 0.2492 (0.2509) loss 0.3621 (0.3613) grad_norm 355571.7812 (inf) mem 14543MB +[2023-10-11 14:45:10 simmim_pretrain](main_simmim.py 218): INFO Train: [75/200][1500/6787] eta 0:22:04 lr 0.000200 time 0.2489 (0.2506) loss 0.3686 (0.3616) grad_norm 374324.2500 (inf) mem 14543MB +[2023-10-11 14:47:15 simmim_pretrain](main_simmim.py 218): INFO Train: [75/200][2000/6787] eta 0:19:58 lr 0.000200 time 0.2504 (0.2505) loss 0.3440 (0.3618) grad_norm 224979.5312 (inf) mem 14543MB +[2023-10-11 14:49:19 simmim_pretrain](main_simmim.py 218): INFO Train: [75/200][2500/6787] eta 0:17:52 lr 0.000200 time 0.2508 (0.2503) loss 0.3786 (0.3620) grad_norm 
132924.9219 (inf) mem 14543MB +[2023-10-11 14:51:24 simmim_pretrain](main_simmim.py 218): INFO Train: [75/200][3000/6787] eta 0:15:47 lr 0.000200 time 0.2468 (0.2503) loss 0.3687 (0.3621) grad_norm 161180.7188 (inf) mem 14543MB +[2023-10-11 14:53:30 simmim_pretrain](main_simmim.py 218): INFO Train: [75/200][3500/6787] eta 0:13:42 lr 0.000200 time 0.2582 (0.2502) loss 0.3826 (0.3620) grad_norm 627796.8125 (inf) mem 14543MB +[2023-10-11 14:55:34 simmim_pretrain](main_simmim.py 218): INFO Train: [75/200][4000/6787] eta 0:11:37 lr 0.000200 time 0.2449 (0.2502) loss 0.3509 (0.3619) grad_norm 363978.8438 (inf) mem 14543MB +[2023-10-11 14:57:39 simmim_pretrain](main_simmim.py 218): INFO Train: [75/200][4500/6787] eta 0:09:32 lr 0.000200 time 0.2460 (0.2502) loss 0.3666 (0.3620) grad_norm 219696.4219 (inf) mem 14543MB +[2023-10-11 14:59:44 simmim_pretrain](main_simmim.py 218): INFO Train: [75/200][5000/6787] eta 0:07:27 lr 0.000200 time 0.2466 (0.2502) loss 0.3493 (0.3619) grad_norm 174469.9375 (inf) mem 14543MB +[2023-10-11 15:01:49 simmim_pretrain](main_simmim.py 218): INFO Train: [75/200][5500/6787] eta 0:05:21 lr 0.000200 time 0.2589 (0.2501) loss 0.3502 (0.3619) grad_norm 237777.6875 (inf) mem 14543MB +[2023-10-11 15:03:54 simmim_pretrain](main_simmim.py 218): INFO Train: [75/200][6000/6787] eta 0:03:16 lr 0.000200 time 0.2489 (0.2501) loss 0.3609 (0.3619) grad_norm 240592.1250 (inf) mem 14543MB +[2023-10-11 15:05:59 simmim_pretrain](main_simmim.py 218): INFO Train: [75/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2515 (0.2501) loss 0.3725 (0.3618) grad_norm 801763.3750 (inf) mem 14543MB +[2023-10-11 15:07:12 simmim_pretrain](main_simmim.py 228): INFO EPOCH 75 training takes 0:28:18 +[2023-10-11 15:07:13 simmim_pretrain](main_simmim.py 218): INFO Train: [76/200][0/6787] eta 2:58:29 lr 0.000200 time 1.5780 (1.5780) loss 0.3845 (0.3845) grad_norm 392027.3438 (392027.3438) mem 14543MB +[2023-10-11 15:09:18 simmim_pretrain](main_simmim.py 218): INFO Train: 
[76/200][500/6787] eta 0:26:24 lr 0.000200 time 0.2538 (0.2521) loss 0.3667 (0.3606) grad_norm 292784.3750 (387393.4688) mem 14543MB +[2023-10-11 15:11:23 simmim_pretrain](main_simmim.py 218): INFO Train: [76/200][1000/6787] eta 0:24:12 lr 0.000200 time 0.2473 (0.2509) loss 0.3499 (0.3613) grad_norm 184588.2656 (inf) mem 14543MB +[2023-10-11 15:13:28 simmim_pretrain](main_simmim.py 218): INFO Train: [76/200][1500/6787] eta 0:22:04 lr 0.000200 time 0.2499 (0.2505) loss 0.3435 (0.3616) grad_norm 166362.0312 (inf) mem 14543MB +[2023-10-11 15:15:33 simmim_pretrain](main_simmim.py 218): INFO Train: [76/200][2000/6787] eta 0:19:58 lr 0.000200 time 0.2532 (0.2503) loss 0.3577 (0.3616) grad_norm 220921.1094 (inf) mem 14543MB +[2023-10-11 15:17:38 simmim_pretrain](main_simmim.py 218): INFO Train: [76/200][2500/6787] eta 0:17:52 lr 0.000200 time 0.2529 (0.2502) loss 0.3761 (0.3617) grad_norm 278814.4688 (inf) mem 14543MB +[2023-10-11 15:19:43 simmim_pretrain](main_simmim.py 218): INFO Train: [76/200][3000/6787] eta 0:15:47 lr 0.000200 time 0.2559 (0.2502) loss 0.3649 (0.3618) grad_norm 548531.9375 (inf) mem 14543MB +[2023-10-11 15:21:47 simmim_pretrain](main_simmim.py 218): INFO Train: [76/200][3500/6787] eta 0:13:42 lr 0.000200 time 0.2506 (0.2501) loss 0.3596 (0.3619) grad_norm 196466.1094 (inf) mem 14543MB +[2023-10-11 15:23:52 simmim_pretrain](main_simmim.py 218): INFO Train: [76/200][4000/6787] eta 0:11:36 lr 0.000200 time 0.2468 (0.2501) loss 0.3490 (0.3621) grad_norm 236983.8281 (inf) mem 14543MB +[2023-10-11 15:25:57 simmim_pretrain](main_simmim.py 218): INFO Train: [76/200][4500/6787] eta 0:09:31 lr 0.000200 time 0.2493 (0.2500) loss 0.3691 (0.3621) grad_norm 183180.9531 (inf) mem 14543MB +[2023-10-11 15:28:02 simmim_pretrain](main_simmim.py 218): INFO Train: [76/200][5000/6787] eta 0:07:26 lr 0.000200 time 0.2472 (0.2500) loss 0.3562 (0.3622) grad_norm 276836.8438 (inf) mem 14543MB +[2023-10-11 15:30:07 simmim_pretrain](main_simmim.py 218): INFO Train: 
[76/200][5500/6787] eta 0:05:21 lr 0.000200 time 0.2479 (0.2500) loss 0.3625 (0.3621) grad_norm 195489.7031 (inf) mem 14543MB +[2023-10-11 15:32:12 simmim_pretrain](main_simmim.py 218): INFO Train: [76/200][6000/6787] eta 0:03:16 lr 0.000200 time 0.2506 (0.2500) loss 0.3577 (0.3621) grad_norm 350405.1250 (inf) mem 14543MB +[2023-10-11 15:34:17 simmim_pretrain](main_simmim.py 218): INFO Train: [76/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2529 (0.2500) loss 0.3712 (0.3621) grad_norm 190940.9375 (inf) mem 14543MB +[2023-10-11 15:35:29 simmim_pretrain](main_simmim.py 228): INFO EPOCH 76 training takes 0:28:17 +[2023-10-11 15:35:31 simmim_pretrain](main_simmim.py 218): INFO Train: [77/200][0/6787] eta 3:01:30 lr 0.000200 time 1.6046 (1.6046) loss 0.3636 (0.3636) grad_norm 181944.9688 (181944.9688) mem 14543MB +[2023-10-11 15:37:36 simmim_pretrain](main_simmim.py 218): INFO Train: [77/200][500/6787] eta 0:26:25 lr 0.000200 time 0.2520 (0.2521) loss 0.3796 (0.3618) grad_norm 189700.7812 (nan) mem 14543MB +[2023-10-11 15:39:40 simmim_pretrain](main_simmim.py 218): INFO Train: [77/200][1000/6787] eta 0:24:11 lr 0.000200 time 0.2547 (0.2508) loss 0.3549 (0.3621) grad_norm 240478.3594 (nan) mem 14543MB +[2023-10-11 15:41:45 simmim_pretrain](main_simmim.py 218): INFO Train: [77/200][1500/6787] eta 0:22:04 lr 0.000200 time 0.2499 (0.2505) loss 0.3316 (0.3623) grad_norm 131342.3906 (nan) mem 14543MB +[2023-10-11 15:43:50 simmim_pretrain](main_simmim.py 218): INFO Train: [77/200][2000/6787] eta 0:19:58 lr 0.000200 time 0.2473 (0.2504) loss 0.3599 (0.3622) grad_norm 176176.1406 (nan) mem 14543MB +[2023-10-11 15:45:55 simmim_pretrain](main_simmim.py 218): INFO Train: [77/200][2500/6787] eta 0:17:52 lr 0.000200 time 0.2463 (0.2503) loss 0.3551 (0.3621) grad_norm 196923.6250 (nan) mem 14543MB +[2023-10-11 15:48:00 simmim_pretrain](main_simmim.py 218): INFO Train: [77/200][3000/6787] eta 0:15:47 lr 0.000200 time 0.2537 (0.2502) loss 0.3863 (0.3617) grad_norm 781097.8125 (nan) 
mem 14543MB +[2023-10-11 15:50:05 simmim_pretrain](main_simmim.py 218): INFO Train: [77/200][3500/6787] eta 0:13:42 lr 0.000200 time 0.2472 (0.2502) loss 0.3788 (0.3616) grad_norm 429625.0000 (nan) mem 14543MB +[2023-10-11 15:52:10 simmim_pretrain](main_simmim.py 218): INFO Train: [77/200][4000/6787] eta 0:11:37 lr 0.000200 time 0.2477 (0.2502) loss 0.3408 (0.3615) grad_norm 366762.7812 (nan) mem 14543MB +[2023-10-11 15:54:15 simmim_pretrain](main_simmim.py 218): INFO Train: [77/200][4500/6787] eta 0:09:32 lr 0.000200 time 0.2521 (0.2502) loss 0.3799 (0.3617) grad_norm 168418.0938 (nan) mem 14543MB +[2023-10-11 15:56:20 simmim_pretrain](main_simmim.py 218): INFO Train: [77/200][5000/6787] eta 0:07:26 lr 0.000200 time 0.2461 (0.2501) loss 0.3514 (0.3620) grad_norm 160842.2031 (nan) mem 14543MB +[2023-10-11 15:58:25 simmim_pretrain](main_simmim.py 218): INFO Train: [77/200][5500/6787] eta 0:05:21 lr 0.000200 time 0.2466 (0.2500) loss 0.3614 (0.3622) grad_norm 74557.4062 (nan) mem 14543MB +[2023-10-11 16:00:29 simmim_pretrain](main_simmim.py 218): INFO Train: [77/200][6000/6787] eta 0:03:16 lr 0.000200 time 0.2482 (0.2499) loss 0.3413 (0.3625) grad_norm 72795.3281 (nan) mem 14543MB +[2023-10-11 16:02:33 simmim_pretrain](main_simmim.py 218): INFO Train: [77/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2488 (0.2498) loss 0.3692 (0.3626) grad_norm 208724.7188 (nan) mem 14543MB +[2023-10-11 16:03:45 simmim_pretrain](main_simmim.py 228): INFO EPOCH 77 training takes 0:28:16 +[2023-10-11 16:03:47 simmim_pretrain](main_simmim.py 218): INFO Train: [78/200][0/6787] eta 2:39:15 lr 0.000200 time 1.4079 (1.4079) loss 0.3440 (0.3440) grad_norm 234052.7656 (234052.7656) mem 14543MB +[2023-10-11 16:05:51 simmim_pretrain](main_simmim.py 218): INFO Train: [78/200][500/6787] eta 0:26:22 lr 0.000200 time 0.2493 (0.2517) loss 0.3596 (0.3609) grad_norm 225183.4375 (201117.8125) mem 14543MB +[2023-10-11 16:07:57 simmim_pretrain](main_simmim.py 218): INFO Train: [78/200][1000/6787] eta 
0:24:15 lr 0.000200 time 0.2542 (0.2516) loss 0.3324 (0.3614) grad_norm 243454.3906 (214886.5156) mem 14543MB +[2023-10-11 16:10:04 simmim_pretrain](main_simmim.py 218): INFO Train: [78/200][1500/6787] eta 0:22:12 lr 0.000200 time 0.2487 (0.2521) loss 0.3631 (0.3613) grad_norm 205800.1094 (225255.0000) mem 14543MB +[2023-10-11 16:12:11 simmim_pretrain](main_simmim.py 218): INFO Train: [78/200][2000/6787] eta 0:20:09 lr 0.000200 time 0.2551 (0.2526) loss 0.3501 (0.3612) grad_norm 332029.8125 (243493.6094) mem 14543MB +[2023-10-11 16:14:19 simmim_pretrain](main_simmim.py 218): INFO Train: [78/200][2500/6787] eta 0:18:06 lr 0.000200 time 0.2589 (0.2534) loss 0.3330 (0.3613) grad_norm 560723.2500 (273898.3125) mem 14543MB +[2023-10-11 16:16:27 simmim_pretrain](main_simmim.py 218): INFO Train: [78/200][3000/6787] eta 0:16:00 lr 0.000200 time 0.2519 (0.2537) loss 0.3506 (0.3612) grad_norm 176057.0625 (292745.9062) mem 14543MB +[2023-10-11 16:18:35 simmim_pretrain](main_simmim.py 218): INFO Train: [78/200][3500/6787] eta 0:13:55 lr 0.000200 time 0.2577 (0.2541) loss 0.3642 (0.3610) grad_norm 426102.3438 (311127.1562) mem 14543MB +[2023-10-11 16:20:43 simmim_pretrain](main_simmim.py 218): INFO Train: [78/200][4000/6787] eta 0:11:49 lr 0.000200 time 0.2540 (0.2545) loss 0.3668 (0.3611) grad_norm 453320.5000 (inf) mem 14543MB +[2023-10-11 16:22:52 simmim_pretrain](main_simmim.py 218): INFO Train: [78/200][4500/6787] eta 0:09:42 lr 0.000200 time 0.2584 (0.2547) loss 0.3489 (0.3614) grad_norm 162307.7500 (inf) mem 14543MB +[2023-10-11 16:25:00 simmim_pretrain](main_simmim.py 218): INFO Train: [78/200][5000/6787] eta 0:07:35 lr 0.000200 time 0.2564 (0.2549) loss 0.3603 (0.3615) grad_norm 173276.1094 (inf) mem 14543MB +[2023-10-11 16:27:10 simmim_pretrain](main_simmim.py 218): INFO Train: [78/200][5500/6787] eta 0:05:28 lr 0.000200 time 0.2601 (0.2553) loss 0.3583 (0.3615) grad_norm 251165.3125 (inf) mem 14543MB +[2023-10-11 16:29:20 simmim_pretrain](main_simmim.py 218): INFO 
Train: [78/200][6000/6787] eta 0:03:21 lr 0.000200 time 0.2543 (0.2557) loss 0.3738 (0.3615) grad_norm 341415.2500 (inf) mem 14543MB +[2023-10-11 16:31:30 simmim_pretrain](main_simmim.py 218): INFO Train: [78/200][6500/6787] eta 0:01:13 lr 0.000200 time 0.2512 (0.2560) loss 0.3524 (0.3615) grad_norm 320658.1250 (inf) mem 14543MB +[2023-10-11 16:32:45 simmim_pretrain](main_simmim.py 228): INFO EPOCH 78 training takes 0:28:59 +[2023-10-11 16:32:47 simmim_pretrain](main_simmim.py 218): INFO Train: [79/200][0/6787] eta 2:58:46 lr 0.000200 time 1.5805 (1.5805) loss 0.3542 (0.3542) grad_norm 284096.3125 (284096.3125) mem 14543MB +[2023-10-11 16:34:53 simmim_pretrain](main_simmim.py 218): INFO Train: [79/200][500/6787] eta 0:26:40 lr 0.000200 time 0.2592 (0.2546) loss 0.3562 (0.3602) grad_norm 598060.8125 (441873.5312) mem 14543MB +[2023-10-11 16:36:59 simmim_pretrain](main_simmim.py 218): INFO Train: [79/200][1000/6787] eta 0:24:26 lr 0.000200 time 0.2484 (0.2534) loss 0.3736 (0.3604) grad_norm 376229.7188 (inf) mem 14543MB +[2023-10-11 16:39:05 simmim_pretrain](main_simmim.py 218): INFO Train: [79/200][1500/6787] eta 0:22:17 lr 0.000200 time 0.2519 (0.2530) loss 0.3801 (0.3607) grad_norm 369283.5312 (inf) mem 14543MB +[2023-10-11 16:41:12 simmim_pretrain](main_simmim.py 218): INFO Train: [79/200][2000/6787] eta 0:20:13 lr 0.000200 time 0.2545 (0.2535) loss 0.3681 (0.3610) grad_norm 221120.5469 (inf) mem 14543MB +[2023-10-11 16:43:19 simmim_pretrain](main_simmim.py 218): INFO Train: [79/200][2500/6787] eta 0:18:07 lr 0.000200 time 0.2511 (0.2536) loss 0.3588 (0.3612) grad_norm 348191.0938 (inf) mem 14543MB +[2023-10-11 16:45:26 simmim_pretrain](main_simmim.py 218): INFO Train: [79/200][3000/6787] eta 0:15:59 lr 0.000200 time 0.2567 (0.2534) loss 0.3691 (0.3613) grad_norm 258462.2188 (inf) mem 14543MB +[2023-10-11 16:47:32 simmim_pretrain](main_simmim.py 218): INFO Train: [79/200][3500/6787] eta 0:13:52 lr 0.000200 time 0.2536 (0.2533) loss 0.3645 (0.3610) grad_norm 
322938.9688 (inf) mem 14543MB +[2023-10-11 16:49:38 simmim_pretrain](main_simmim.py 218): INFO Train: [79/200][4000/6787] eta 0:11:45 lr 0.000200 time 0.2478 (0.2532) loss 0.3653 (0.3613) grad_norm 166447.4531 (inf) mem 14543MB +[2023-10-11 16:51:45 simmim_pretrain](main_simmim.py 218): INFO Train: [79/200][4500/6787] eta 0:09:39 lr 0.000200 time 0.2506 (0.2533) loss 0.3666 (0.3615) grad_norm 354756.0625 (inf) mem 14543MB +[2023-10-11 16:53:51 simmim_pretrain](main_simmim.py 218): INFO Train: [79/200][5000/6787] eta 0:07:32 lr 0.000200 time 0.2529 (0.2532) loss 0.3596 (0.3616) grad_norm 384136.1875 (inf) mem 14543MB +[2023-10-11 16:55:58 simmim_pretrain](main_simmim.py 218): INFO Train: [79/200][5500/6787] eta 0:05:25 lr 0.000200 time 0.2511 (0.2532) loss 0.3690 (0.3617) grad_norm 499219.9688 (inf) mem 14543MB +[2023-10-11 16:58:04 simmim_pretrain](main_simmim.py 218): INFO Train: [79/200][6000/6787] eta 0:03:19 lr 0.000200 time 0.2516 (0.2531) loss 0.3602 (0.3615) grad_norm 352363.7500 (inf) mem 14543MB +[2023-10-11 17:00:10 simmim_pretrain](main_simmim.py 218): INFO Train: [79/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2600 (0.2531) loss 0.3549 (0.3615) grad_norm 362080.4688 (inf) mem 14543MB +[2023-10-11 17:01:24 simmim_pretrain](main_simmim.py 228): INFO EPOCH 79 training takes 0:28:38 +[2023-10-11 17:01:25 simmim_pretrain](main_simmim.py 218): INFO Train: [80/200][0/6787] eta 2:49:58 lr 0.000200 time 1.5027 (1.5027) loss 0.3561 (0.3561) grad_norm 392068.9062 (392068.9062) mem 14543MB +[2023-10-11 17:03:31 simmim_pretrain](main_simmim.py 218): INFO Train: [80/200][500/6787] eta 0:26:40 lr 0.000200 time 0.2542 (0.2546) loss 0.3543 (0.3606) grad_norm 386334.5625 (inf) mem 14543MB +[2023-10-11 17:05:38 simmim_pretrain](main_simmim.py 218): INFO Train: [80/200][1000/6787] eta 0:24:29 lr 0.000200 time 0.2489 (0.2538) loss 0.4944 (0.4158) grad_norm 41777.4336 (inf) mem 14543MB +[2023-10-11 17:07:46 simmim_pretrain](main_simmim.py 218): INFO Train: 
[80/200][1500/6787] eta 0:22:27 lr 0.000200 time 0.2578 (0.2549) loss 0.3775 (0.4177) grad_norm 29608.2852 (inf) mem 14543MB +[2023-10-11 17:09:56 simmim_pretrain](main_simmim.py 218): INFO Train: [80/200][2000/6787] eta 0:20:25 lr 0.000200 time 0.2609 (0.2559) loss 0.3715 (0.4066) grad_norm 34911.1953 (inf) mem 14543MB +[2023-10-11 17:12:05 simmim_pretrain](main_simmim.py 218): INFO Train: [80/200][2500/6787] eta 0:18:19 lr 0.000200 time 0.2522 (0.2564) loss 0.3555 (0.3991) grad_norm 30530.7637 (inf) mem 14543MB +[2023-10-11 17:14:13 simmim_pretrain](main_simmim.py 218): INFO Train: [80/200][3000/6787] eta 0:16:11 lr 0.000200 time 0.2571 (0.2565) loss 0.3648 (0.3939) grad_norm 49212.6211 (inf) mem 14543MB +[2023-10-11 17:16:22 simmim_pretrain](main_simmim.py 218): INFO Train: [80/200][3500/6787] eta 0:14:03 lr 0.000200 time 0.2583 (0.2567) loss 0.3896 (0.3898) grad_norm 43617.4805 (inf) mem 14543MB +[2023-10-11 17:18:31 simmim_pretrain](main_simmim.py 218): INFO Train: [80/200][4000/6787] eta 0:11:55 lr 0.000200 time 0.2573 (0.2567) loss 0.3550 (0.3868) grad_norm 53002.8789 (inf) mem 14543MB +[2023-10-11 17:20:38 simmim_pretrain](main_simmim.py 218): INFO Train: [80/200][4500/6787] eta 0:09:46 lr 0.000200 time 0.2511 (0.2564) loss 0.3633 (0.3843) grad_norm 44977.1523 (inf) mem 14543MB +[2023-10-11 17:22:44 simmim_pretrain](main_simmim.py 218): INFO Train: [80/200][5000/6787] eta 0:07:37 lr 0.000200 time 0.2477 (0.2561) loss 0.3480 (0.3823) grad_norm 46885.5469 (inf) mem 14543MB +[2023-10-11 17:24:52 simmim_pretrain](main_simmim.py 218): INFO Train: [80/200][5500/6787] eta 0:05:29 lr 0.000200 time 0.2513 (0.2560) loss 0.3505 (0.3804) grad_norm 72134.4531 (inf) mem 14543MB +[2023-10-11 17:27:00 simmim_pretrain](main_simmim.py 218): INFO Train: [80/200][6000/6787] eta 0:03:21 lr 0.000200 time 0.2508 (0.2559) loss 0.3730 (0.3790) grad_norm 192133.0469 (inf) mem 14543MB +[2023-10-11 17:29:07 simmim_pretrain](main_simmim.py 218): INFO Train: [80/200][6500/6787] eta 
0:01:13 lr 0.000200 time 0.2571 (0.2558) loss 0.3550 (0.3778) grad_norm 133953.7812 (inf) mem 14543MB +[2023-10-11 17:30:22 simmim_pretrain](main_simmim.py 228): INFO EPOCH 80 training takes 0:28:58 +[2023-10-11 17:30:22 simmim_pretrain](utils.py 62): INFO /root/autodl-tmp/LSQ-simmim/checkpoint/simmim_pretrain/vit_simmim/ckpt_epoch_80.pth saving...... +[2023-10-11 17:30:23 simmim_pretrain](utils.py 64): INFO /root/autodl-tmp/LSQ-simmim/checkpoint/simmim_pretrain/vit_simmim/ckpt_epoch_80.pth saved !!! +[2023-10-11 17:30:24 simmim_pretrain](main_simmim.py 218): INFO Train: [81/200][0/6787] eta 2:48:38 lr 0.000200 time 1.4909 (1.4909) loss 0.3771 (0.3771) grad_norm 214235.6719 (214235.6719) mem 14543MB +[2023-10-11 17:32:31 simmim_pretrain](main_simmim.py 218): INFO Train: [81/200][500/6787] eta 0:26:55 lr 0.000200 time 0.2536 (0.2569) loss 0.3627 (0.3618) grad_norm 281367.5625 (138115.6875) mem 14543MB +[2023-10-11 17:34:39 simmim_pretrain](main_simmim.py 218): INFO Train: [81/200][1000/6787] eta 0:24:43 lr 0.000200 time 0.2565 (0.2563) loss 0.3433 (0.3618) grad_norm 121110.8594 (144462.2812) mem 14543MB +[2023-10-11 17:36:47 simmim_pretrain](main_simmim.py 218): INFO Train: [81/200][1500/6787] eta 0:22:33 lr 0.000200 time 0.2591 (0.2561) loss 0.3675 (0.3615) grad_norm 192743.3750 (167388.7969) mem 14543MB +[2023-10-11 17:38:54 simmim_pretrain](main_simmim.py 218): INFO Train: [81/200][2000/6787] eta 0:20:24 lr 0.000200 time 0.2534 (0.2557) loss 0.3458 (0.3615) grad_norm 216679.9688 (169846.1562) mem 14543MB +[2023-10-11 17:41:03 simmim_pretrain](main_simmim.py 218): INFO Train: [81/200][2500/6787] eta 0:18:18 lr 0.000200 time 0.2609 (0.2563) loss 0.3658 (0.3613) grad_norm 210193.6250 (188988.0156) mem 14543MB +[2023-10-11 17:43:14 simmim_pretrain](main_simmim.py 218): INFO Train: [81/200][3000/6787] eta 0:16:13 lr 0.000200 time 0.2609 (0.2569) loss 0.3578 (0.3612) grad_norm 224940.8750 (202727.4219) mem 14543MB +[2023-10-11 17:45:24 simmim_pretrain](main_simmim.py 
218): INFO Train: [81/200][3500/6787] eta 0:14:06 lr 0.000200 time 0.2610 (0.2574) loss 0.3432 (0.3611) grad_norm 385088.5000 (219961.5938) mem 14543MB +[2023-10-11 17:47:34 simmim_pretrain](main_simmim.py 218): INFO Train: [81/200][4000/6787] eta 0:11:58 lr 0.000200 time 0.2607 (0.2578) loss 0.3515 (0.3608) grad_norm 248427.4375 (234672.3750) mem 14543MB +[2023-10-11 17:49:44 simmim_pretrain](main_simmim.py 218): INFO Train: [81/200][4500/6787] eta 0:09:50 lr 0.000200 time 0.2610 (0.2581) loss 0.3676 (0.3608) grad_norm 588371.0000 (inf) mem 14543MB +[2023-10-11 17:51:52 simmim_pretrain](main_simmim.py 218): INFO Train: [81/200][5000/6787] eta 0:07:40 lr 0.000200 time 0.2614 (0.2578) loss 0.3413 (0.3606) grad_norm 172051.9375 (inf) mem 14543MB +[2023-10-11 17:54:00 simmim_pretrain](main_simmim.py 218): INFO Train: [81/200][5500/6787] eta 0:05:31 lr 0.000200 time 0.2612 (0.2577) loss 0.3598 (0.3605) grad_norm 481040.5312 (inf) mem 14543MB +[2023-10-11 17:56:08 simmim_pretrain](main_simmim.py 218): INFO Train: [81/200][6000/6787] eta 0:03:22 lr 0.000200 time 0.2521 (0.2576) loss 0.3589 (0.3605) grad_norm 402359.4375 (inf) mem 14543MB +[2023-10-11 17:58:15 simmim_pretrain](main_simmim.py 218): INFO Train: [81/200][6500/6787] eta 0:01:13 lr 0.000200 time 0.2574 (0.2573) loss 0.3493 (0.3604) grad_norm 186054.0469 (inf) mem 14543MB +[2023-10-11 17:59:29 simmim_pretrain](main_simmim.py 228): INFO EPOCH 81 training takes 0:29:06 +[2023-10-11 17:59:30 simmim_pretrain](main_simmim.py 218): INFO Train: [82/200][0/6787] eta 2:57:24 lr 0.000200 time 1.5684 (1.5684) loss 0.3604 (0.3604) grad_norm 511990.7500 (511990.7500) mem 14543MB +[2023-10-11 18:01:38 simmim_pretrain](main_simmim.py 218): INFO Train: [82/200][500/6787] eta 0:27:00 lr 0.000200 time 0.2607 (0.2577) loss 0.3459 (0.3604) grad_norm 247524.6094 (inf) mem 14543MB +[2023-10-11 18:03:48 simmim_pretrain](main_simmim.py 218): INFO Train: [82/200][1000/6787] eta 0:24:58 lr 0.000200 time 0.2607 (0.2589) loss 0.3563 
(0.3606) grad_norm 138009.5312 (inf) mem 14543MB +[2023-10-11 18:05:58 simmim_pretrain](main_simmim.py 218): INFO Train: [82/200][1500/6787] eta 0:22:50 lr 0.000200 time 0.2604 (0.2593) loss 0.3378 (0.3610) grad_norm 162408.1406 (inf) mem 14543MB +[2023-10-11 18:08:08 simmim_pretrain](main_simmim.py 218): INFO Train: [82/200][2000/6787] eta 0:20:42 lr 0.000200 time 0.2608 (0.2595) loss 0.3687 (0.3619) grad_norm 86026.9688 (inf) mem 14543MB +[2023-10-11 18:10:18 simmim_pretrain](main_simmim.py 218): INFO Train: [82/200][2500/6787] eta 0:18:32 lr 0.000200 time 0.2605 (0.2596) loss 0.3531 (0.3621) grad_norm 142472.1250 (inf) mem 14543MB +[2023-10-11 18:12:28 simmim_pretrain](main_simmim.py 218): INFO Train: [82/200][3000/6787] eta 0:16:23 lr 0.000200 time 0.2611 (0.2597) loss 0.3376 (0.3621) grad_norm 96795.3359 (inf) mem 14543MB +[2023-10-11 18:14:38 simmim_pretrain](main_simmim.py 218): INFO Train: [82/200][3500/6787] eta 0:14:13 lr 0.000200 time 0.2625 (0.2598) loss 0.3624 (0.3623) grad_norm 117470.1875 (inf) mem 14543MB +[2023-10-11 18:16:48 simmim_pretrain](main_simmim.py 218): INFO Train: [82/200][4000/6787] eta 0:12:04 lr 0.000200 time 0.2615 (0.2598) loss 0.3521 (0.3623) grad_norm 137257.2812 (inf) mem 14543MB +[2023-10-11 18:18:58 simmim_pretrain](main_simmim.py 218): INFO Train: [82/200][4500/6787] eta 0:09:54 lr 0.000200 time 0.2612 (0.2599) loss 0.3631 (0.3622) grad_norm 346034.4062 (inf) mem 14543MB +[2023-10-11 18:21:09 simmim_pretrain](main_simmim.py 218): INFO Train: [82/200][5000/6787] eta 0:07:44 lr 0.000200 time 0.2611 (0.2600) loss 0.3672 (0.3621) grad_norm 239760.8906 (inf) mem 14543MB +[2023-10-11 18:23:19 simmim_pretrain](main_simmim.py 218): INFO Train: [82/200][5500/6787] eta 0:05:34 lr 0.000200 time 0.2610 (0.2600) loss 0.3961 (0.3621) grad_norm 140626.7812 (inf) mem 14543MB +[2023-10-11 18:25:29 simmim_pretrain](main_simmim.py 218): INFO Train: [82/200][6000/6787] eta 0:03:24 lr 0.000200 time 0.2611 (0.2600) loss 0.3755 (0.3620) grad_norm 
170600.2188 (inf) mem 14543MB +[2023-10-11 18:27:39 simmim_pretrain](main_simmim.py 218): INFO Train: [82/200][6500/6787] eta 0:01:14 lr 0.000200 time 0.2602 (0.2601) loss 0.3622 (0.3618) grad_norm 550202.6250 (inf) mem 14543MB +[2023-10-11 18:28:55 simmim_pretrain](main_simmim.py 228): INFO EPOCH 82 training takes 0:29:26 +[2023-10-11 18:28:56 simmim_pretrain](main_simmim.py 218): INFO Train: [83/200][0/6787] eta 2:48:42 lr 0.000200 time 1.4914 (1.4914) loss 0.3392 (0.3392) grad_norm 215142.2188 (215142.2188) mem 14543MB +[2023-10-11 18:31:03 simmim_pretrain](main_simmim.py 218): INFO Train: [83/200][500/6787] eta 0:26:56 lr 0.000200 time 0.2585 (0.2572) loss 0.3776 (0.3601) grad_norm 395189.3750 (380096.2188) mem 14543MB +[2023-10-11 18:33:11 simmim_pretrain](main_simmim.py 218): INFO Train: [83/200][1000/6787] eta 0:24:43 lr 0.000200 time 0.2471 (0.2563) loss 0.3595 (0.3602) grad_norm 392986.3438 (380368.7188) mem 14543MB +[2023-10-11 18:35:18 simmim_pretrain](main_simmim.py 218): INFO Train: [83/200][1500/6787] eta 0:22:31 lr 0.000200 time 0.2552 (0.2557) loss 0.3519 (0.3599) grad_norm 473086.0000 (inf) mem 14543MB +[2023-10-11 18:37:26 simmim_pretrain](main_simmim.py 218): INFO Train: [83/200][2000/6787] eta 0:20:23 lr 0.000200 time 0.2553 (0.2557) loss 0.3864 (0.3597) grad_norm 414694.6250 (inf) mem 14543MB +[2023-10-11 18:39:34 simmim_pretrain](main_simmim.py 218): INFO Train: [83/200][2500/6787] eta 0:18:16 lr 0.000200 time 0.2553 (0.2557) loss 0.3424 (0.3596) grad_norm 778571.1250 (inf) mem 14543MB +[2023-10-11 18:41:42 simmim_pretrain](main_simmim.py 218): INFO Train: [83/200][3000/6787] eta 0:16:08 lr 0.000200 time 0.2551 (0.2558) loss 0.3450 (0.3598) grad_norm 249816.1406 (inf) mem 14543MB +[2023-10-11 18:43:50 simmim_pretrain](main_simmim.py 218): INFO Train: [83/200][3500/6787] eta 0:14:00 lr 0.000200 time 0.2557 (0.2558) loss 0.3611 (0.3599) grad_norm 411463.0000 (inf) mem 14543MB +[2023-10-11 18:45:58 simmim_pretrain](main_simmim.py 218): INFO 
Train: [83/200][4000/6787] eta 0:11:52 lr 0.000200 time 0.2506 (0.2557) loss 0.3607 (0.3599) grad_norm 333321.7500 (inf) mem 14543MB +[2023-10-11 18:48:05 simmim_pretrain](main_simmim.py 218): INFO Train: [83/200][4500/6787] eta 0:09:44 lr 0.000200 time 0.2550 (0.2555) loss 0.3612 (0.3600) grad_norm 231394.6562 (inf) mem 14543MB +[2023-10-11 18:50:12 simmim_pretrain](main_simmim.py 218): INFO Train: [83/200][5000/6787] eta 0:07:36 lr 0.000200 time 0.2563 (0.2555) loss 0.3775 (0.3602) grad_norm 284874.9375 (inf) mem 14543MB +[2023-10-11 18:52:20 simmim_pretrain](main_simmim.py 218): INFO Train: [83/200][5500/6787] eta 0:05:28 lr 0.000200 time 0.2531 (0.2554) loss 0.3680 (0.3604) grad_norm 291277.3750 (inf) mem 14543MB +[2023-10-11 18:54:27 simmim_pretrain](main_simmim.py 218): INFO Train: [83/200][6000/6787] eta 0:03:21 lr 0.000200 time 0.2527 (0.2554) loss 0.3421 (0.3605) grad_norm 224829.9062 (inf) mem 14543MB +[2023-10-11 18:56:35 simmim_pretrain](main_simmim.py 218): INFO Train: [83/200][6500/6787] eta 0:01:13 lr 0.000200 time 0.2537 (0.2554) loss 0.3562 (0.3605) grad_norm 302924.4688 (inf) mem 14543MB +[2023-10-11 18:57:48 simmim_pretrain](main_simmim.py 228): INFO EPOCH 83 training takes 0:28:53 +[2023-10-11 18:57:50 simmim_pretrain](main_simmim.py 218): INFO Train: [84/200][0/6787] eta 3:00:49 lr 0.000200 time 1.5986 (1.5986) loss 0.3934 (0.3934) grad_norm 218183.4844 (218183.4844) mem 14543MB +[2023-10-11 18:59:56 simmim_pretrain](main_simmim.py 218): INFO Train: [84/200][500/6787] eta 0:26:44 lr 0.000200 time 0.2492 (0.2552) loss 0.3576 (0.3597) grad_norm 283566.3750 (inf) mem 14543MB +[2023-10-11 19:02:02 simmim_pretrain](main_simmim.py 218): INFO Train: [84/200][1000/6787] eta 0:24:29 lr 0.000200 time 0.2540 (0.2538) loss 0.3536 (0.3601) grad_norm 286184.2812 (inf) mem 14543MB +[2023-10-11 19:04:09 simmim_pretrain](main_simmim.py 218): INFO Train: [84/200][1500/6787] eta 0:22:18 lr 0.000200 time 0.2571 (0.2532) loss 0.3668 (0.3605) grad_norm 231786.2188 
(inf) mem 14543MB +[2023-10-11 19:06:14 simmim_pretrain](main_simmim.py 218): INFO Train: [84/200][2000/6787] eta 0:20:10 lr 0.000200 time 0.2495 (0.2528) loss 0.3863 (0.3608) grad_norm 166839.6875 (inf) mem 14543MB +[2023-10-11 19:08:20 simmim_pretrain](main_simmim.py 218): INFO Train: [84/200][2500/6787] eta 0:18:02 lr 0.000200 time 0.2489 (0.2524) loss 0.3650 (0.3609) grad_norm 162355.9219 (inf) mem 14543MB +[2023-10-11 19:10:25 simmim_pretrain](main_simmim.py 218): INFO Train: [84/200][3000/6787] eta 0:15:54 lr 0.000200 time 0.2509 (0.2521) loss 0.3648 (0.3606) grad_norm 345335.5625 (inf) mem 14543MB +[2023-10-11 19:12:30 simmim_pretrain](main_simmim.py 218): INFO Train: [84/200][3500/6787] eta 0:13:47 lr 0.000200 time 0.2489 (0.2519) loss 0.3722 (0.3606) grad_norm 677765.6250 (inf) mem 14543MB +[2023-10-11 19:14:35 simmim_pretrain](main_simmim.py 218): INFO Train: [84/200][4000/6787] eta 0:11:41 lr 0.000200 time 0.2461 (0.2516) loss 0.3652 (0.3606) grad_norm 385948.4062 (inf) mem 14543MB +[2023-10-11 19:16:40 simmim_pretrain](main_simmim.py 218): INFO Train: [84/200][4500/6787] eta 0:09:34 lr 0.000200 time 0.2461 (0.2514) loss 0.3781 (0.3605) grad_norm 368063.0000 (inf) mem 14543MB +[2023-10-11 19:18:45 simmim_pretrain](main_simmim.py 218): INFO Train: [84/200][5000/6787] eta 0:07:28 lr 0.000200 time 0.2504 (0.2512) loss 0.3653 (0.3605) grad_norm 210457.3281 (inf) mem 14543MB +[2023-10-11 19:20:50 simmim_pretrain](main_simmim.py 218): INFO Train: [84/200][5500/6787] eta 0:05:23 lr 0.000200 time 0.2463 (0.2511) loss 0.3665 (0.3604) grad_norm 213639.8438 (inf) mem 14543MB +[2023-10-11 19:22:55 simmim_pretrain](main_simmim.py 218): INFO Train: [84/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2473 (0.2510) loss 0.3867 (0.3603) grad_norm 284046.3125 (inf) mem 14543MB +[2023-10-11 19:25:00 simmim_pretrain](main_simmim.py 218): INFO Train: [84/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2533 (0.2509) loss 0.3559 (0.3603) grad_norm 346463.3750 (inf) mem 14543MB 
+[2023-10-11 19:26:12 simmim_pretrain](main_simmim.py 228): INFO EPOCH 84 training takes 0:28:23 +[2023-10-11 19:26:13 simmim_pretrain](main_simmim.py 218): INFO Train: [85/200][0/6787] eta 2:52:16 lr 0.000200 time 1.5229 (1.5229) loss 0.3701 (0.3701) grad_norm 231430.0469 (231430.0469) mem 14543MB +[2023-10-11 19:28:18 simmim_pretrain](main_simmim.py 218): INFO Train: [85/200][500/6787] eta 0:26:27 lr 0.000200 time 0.2488 (0.2526) loss 0.3522 (0.3620) grad_norm 272557.3125 (250176.9062) mem 14543MB +[2023-10-11 19:30:23 simmim_pretrain](main_simmim.py 218): INFO Train: [85/200][1000/6787] eta 0:24:13 lr 0.000200 time 0.2470 (0.2512) loss 0.3777 (0.3619) grad_norm 284231.1250 (244329.8750) mem 14543MB +[2023-10-11 19:32:28 simmim_pretrain](main_simmim.py 218): INFO Train: [85/200][1500/6787] eta 0:22:06 lr 0.000200 time 0.2463 (0.2508) loss 0.3492 (0.3613) grad_norm 226388.4062 (249174.1719) mem 14543MB +[2023-10-11 19:34:34 simmim_pretrain](main_simmim.py 218): INFO Train: [85/200][2000/6787] eta 0:20:00 lr 0.000200 time 0.2501 (0.2507) loss 0.3615 (0.3610) grad_norm 441312.4688 (267987.4688) mem 14543MB +[2023-10-11 19:36:39 simmim_pretrain](main_simmim.py 218): INFO Train: [85/200][2500/6787] eta 0:17:54 lr 0.000200 time 0.2510 (0.2507) loss 0.3541 (0.3609) grad_norm 245185.4375 (inf) mem 14543MB +[2023-10-11 19:38:44 simmim_pretrain](main_simmim.py 218): INFO Train: [85/200][3000/6787] eta 0:15:49 lr 0.000200 time 0.2533 (0.2508) loss 0.3656 (0.3612) grad_norm 243705.8594 (inf) mem 14543MB +[2023-10-11 19:40:50 simmim_pretrain](main_simmim.py 218): INFO Train: [85/200][3500/6787] eta 0:13:44 lr 0.000200 time 0.2491 (0.2508) loss 0.3729 (0.3610) grad_norm 229829.5469 (inf) mem 14543MB +[2023-10-11 19:42:56 simmim_pretrain](main_simmim.py 218): INFO Train: [85/200][4000/6787] eta 0:11:39 lr 0.000200 time 0.2560 (0.2509) loss 0.3640 (0.3609) grad_norm 227095.2344 (inf) mem 14543MB +[2023-10-11 19:45:01 simmim_pretrain](main_simmim.py 218): INFO Train: 
[85/200][4500/6787] eta 0:09:33 lr 0.000200 time 0.2498 (0.2509) loss 0.3661 (0.3608) grad_norm 178129.6250 (inf) mem 14543MB +[2023-10-11 19:47:07 simmim_pretrain](main_simmim.py 218): INFO Train: [85/200][5000/6787] eta 0:07:28 lr 0.000200 time 0.2462 (0.2509) loss 0.3586 (0.3606) grad_norm 253679.6719 (inf) mem 14543MB +[2023-10-11 19:49:12 simmim_pretrain](main_simmim.py 218): INFO Train: [85/200][5500/6787] eta 0:05:22 lr 0.000200 time 0.2463 (0.2509) loss 0.3660 (0.3606) grad_norm 396014.1562 (inf) mem 14543MB +[2023-10-11 19:51:18 simmim_pretrain](main_simmim.py 218): INFO Train: [85/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2516 (0.2509) loss 0.3699 (0.3605) grad_norm 486671.0938 (inf) mem 14543MB +[2023-10-11 19:53:23 simmim_pretrain](main_simmim.py 218): INFO Train: [85/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2506 (0.2509) loss 0.3733 (0.3604) grad_norm 221600.2031 (inf) mem 14543MB +[2023-10-11 19:54:36 simmim_pretrain](main_simmim.py 228): INFO EPOCH 85 training takes 0:28:23 +[2023-10-11 19:54:37 simmim_pretrain](main_simmim.py 218): INFO Train: [86/200][0/6787] eta 2:47:26 lr 0.000200 time 1.4803 (1.4803) loss 0.3559 (0.3559) grad_norm 373023.6562 (373023.6562) mem 14543MB +[2023-10-11 19:56:42 simmim_pretrain](main_simmim.py 218): INFO Train: [86/200][500/6787] eta 0:26:28 lr 0.000200 time 0.2570 (0.2527) loss 0.3705 (0.3608) grad_norm 274661.5312 (263276.8125) mem 14543MB +[2023-10-11 19:58:47 simmim_pretrain](main_simmim.py 218): INFO Train: [86/200][1000/6787] eta 0:24:14 lr 0.000200 time 0.2516 (0.2513) loss 0.3515 (0.3604) grad_norm 292293.8750 (261800.0312) mem 14543MB +[2023-10-11 20:00:52 simmim_pretrain](main_simmim.py 218): INFO Train: [86/200][1500/6787] eta 0:22:05 lr 0.000200 time 0.2536 (0.2508) loss 0.3623 (0.3608) grad_norm 316428.7500 (257527.7188) mem 14543MB +[2023-10-11 20:02:57 simmim_pretrain](main_simmim.py 218): INFO Train: [86/200][2000/6787] eta 0:19:59 lr 0.000200 time 0.2489 (0.2505) loss 0.3681 (0.3609) 
grad_norm 447697.5938 (269637.5625) mem 14543MB +[2023-10-11 20:05:02 simmim_pretrain](main_simmim.py 218): INFO Train: [86/200][2500/6787] eta 0:17:53 lr 0.000200 time 0.2509 (0.2504) loss 0.3650 (0.3609) grad_norm 243482.3594 (inf) mem 14543MB +[2023-10-11 20:07:07 simmim_pretrain](main_simmim.py 218): INFO Train: [86/200][3000/6787] eta 0:15:48 lr 0.000200 time 0.2478 (0.2503) loss 0.3643 (0.3610) grad_norm 221536.2500 (inf) mem 14543MB +[2023-10-11 20:09:12 simmim_pretrain](main_simmim.py 218): INFO Train: [86/200][3500/6787] eta 0:13:42 lr 0.000200 time 0.2533 (0.2503) loss 0.3673 (0.3610) grad_norm 199322.9219 (inf) mem 14543MB +[2023-10-11 20:11:17 simmim_pretrain](main_simmim.py 218): INFO Train: [86/200][4000/6787] eta 0:11:37 lr 0.000200 time 0.2463 (0.2503) loss 0.3743 (0.3612) grad_norm 195014.1094 (inf) mem 14543MB +[2023-10-11 20:13:22 simmim_pretrain](main_simmim.py 218): INFO Train: [86/200][4500/6787] eta 0:09:32 lr 0.000200 time 0.2511 (0.2502) loss 0.3910 (0.3610) grad_norm 244593.2031 (inf) mem 14543MB +[2023-10-11 20:15:27 simmim_pretrain](main_simmim.py 218): INFO Train: [86/200][5000/6787] eta 0:07:27 lr 0.000200 time 0.2461 (0.2502) loss 0.3568 (0.3609) grad_norm 317116.0000 (inf) mem 14543MB +[2023-10-11 20:17:32 simmim_pretrain](main_simmim.py 218): INFO Train: [86/200][5500/6787] eta 0:05:22 lr 0.000200 time 0.2464 (0.2503) loss 0.3654 (0.3609) grad_norm 265173.1250 (inf) mem 14543MB +[2023-10-11 20:19:38 simmim_pretrain](main_simmim.py 218): INFO Train: [86/200][6000/6787] eta 0:03:16 lr 0.000200 time 0.2503 (0.2503) loss 0.3632 (0.3610) grad_norm 371299.0625 (inf) mem 14543MB +[2023-10-11 20:21:43 simmim_pretrain](main_simmim.py 218): INFO Train: [86/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2511 (0.2503) loss 0.3729 (0.3609) grad_norm 340506.0625 (inf) mem 14543MB +[2023-10-11 20:22:56 simmim_pretrain](main_simmim.py 228): INFO EPOCH 86 training takes 0:28:20 +[2023-10-11 20:22:57 simmim_pretrain](main_simmim.py 218): INFO Train: 
[87/200][0/6787] eta 2:54:11 lr 0.000200 time 1.5400 (1.5400) loss 0.3515 (0.3515) grad_norm 261586.7969 (261586.7969) mem 14543MB +[2023-10-11 20:25:02 simmim_pretrain](main_simmim.py 218): INFO Train: [87/200][500/6787] eta 0:26:29 lr 0.000200 time 0.2472 (0.2528) loss 0.3369 (0.3603) grad_norm 282509.0312 (283270.1562) mem 14543MB +[2023-10-11 20:27:08 simmim_pretrain](main_simmim.py 218): INFO Train: [87/200][1000/6787] eta 0:24:17 lr 0.000200 time 0.2496 (0.2519) loss 0.3476 (0.3604) grad_norm 320019.9688 (325104.2500) mem 14543MB +[2023-10-11 20:29:13 simmim_pretrain](main_simmim.py 218): INFO Train: [87/200][1500/6787] eta 0:22:09 lr 0.000200 time 0.2510 (0.2515) loss 0.3502 (0.3604) grad_norm 477013.1875 (350222.0625) mem 14543MB +[2023-10-11 20:31:18 simmim_pretrain](main_simmim.py 218): INFO Train: [87/200][2000/6787] eta 0:20:03 lr 0.000200 time 0.2462 (0.2513) loss 0.3574 (0.3602) grad_norm 289423.2188 (375225.8438) mem 14543MB +[2023-10-11 20:33:24 simmim_pretrain](main_simmim.py 218): INFO Train: [87/200][2500/6787] eta 0:17:57 lr 0.000200 time 0.2538 (0.2513) loss 0.3684 (0.3604) grad_norm 387449.8125 (inf) mem 14543MB +[2023-10-11 20:35:29 simmim_pretrain](main_simmim.py 218): INFO Train: [87/200][3000/6787] eta 0:15:51 lr 0.000200 time 0.2521 (0.2512) loss 0.3710 (0.3604) grad_norm 315114.2188 (inf) mem 14543MB +[2023-10-11 20:37:35 simmim_pretrain](main_simmim.py 218): INFO Train: [87/200][3500/6787] eta 0:13:45 lr 0.000200 time 0.2503 (0.2511) loss 0.3631 (0.3606) grad_norm 272360.9375 (inf) mem 14543MB +[2023-10-11 20:39:40 simmim_pretrain](main_simmim.py 218): INFO Train: [87/200][4000/6787] eta 0:11:39 lr 0.000200 time 0.2483 (0.2510) loss 0.3713 (0.3605) grad_norm 260816.7656 (inf) mem 14543MB +[2023-10-11 20:41:45 simmim_pretrain](main_simmim.py 218): INFO Train: [87/200][4500/6787] eta 0:09:33 lr 0.000200 time 0.2463 (0.2509) loss 0.3785 (0.3607) grad_norm 167021.7188 (inf) mem 14543MB +[2023-10-11 20:43:50 simmim_pretrain](main_simmim.py 
218): INFO Train: [87/200][5000/6787] eta 0:07:28 lr 0.000200 time 0.2466 (0.2508) loss 0.3553 (0.3609) grad_norm 387843.6875 (inf) mem 14543MB +[2023-10-11 20:45:55 simmim_pretrain](main_simmim.py 218): INFO Train: [87/200][5500/6787] eta 0:05:22 lr 0.000200 time 0.2468 (0.2508) loss 0.3530 (0.3609) grad_norm 240784.5781 (inf) mem 14543MB +[2023-10-11 20:48:00 simmim_pretrain](main_simmim.py 218): INFO Train: [87/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2526 (0.2508) loss 0.3655 (0.3608) grad_norm 350282.4375 (inf) mem 14543MB +[2023-10-11 20:50:06 simmim_pretrain](main_simmim.py 218): INFO Train: [87/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2460 (0.2507) loss 0.3839 (0.3610) grad_norm 172207.4375 (inf) mem 14543MB +[2023-10-11 20:51:18 simmim_pretrain](main_simmim.py 228): INFO EPOCH 87 training takes 0:28:22 +[2023-10-11 20:51:20 simmim_pretrain](main_simmim.py 218): INFO Train: [88/200][0/6787] eta 2:59:01 lr 0.000200 time 1.5827 (1.5827) loss 0.3547 (0.3547) grad_norm 227045.6562 (227045.6562) mem 14543MB +[2023-10-11 20:53:25 simmim_pretrain](main_simmim.py 218): INFO Train: [88/200][500/6787] eta 0:26:28 lr 0.000200 time 0.2552 (0.2527) loss 0.3458 (0.3622) grad_norm 160340.3438 (257354.3281) mem 14543MB +[2023-10-11 20:55:30 simmim_pretrain](main_simmim.py 218): INFO Train: [88/200][1000/6787] eta 0:24:16 lr 0.000200 time 0.2488 (0.2517) loss 0.3636 (0.3614) grad_norm 298845.7812 (259312.7812) mem 14543MB +[2023-10-11 20:57:35 simmim_pretrain](main_simmim.py 218): INFO Train: [88/200][1500/6787] eta 0:22:07 lr 0.000200 time 0.2520 (0.2512) loss 0.4119 (0.3970) grad_norm 56660.4883 (inf) mem 14543MB +[2023-10-11 20:59:40 simmim_pretrain](main_simmim.py 218): INFO Train: [88/200][2000/6787] eta 0:20:01 lr 0.000200 time 0.2495 (0.2511) loss 0.3587 (0.3927) grad_norm 34838.4766 (inf) mem 14543MB +[2023-10-11 21:01:46 simmim_pretrain](main_simmim.py 218): INFO Train: [88/200][2500/6787] eta 0:17:56 lr 0.000200 time 0.2513 (0.2510) loss 0.3521 
(0.3880) grad_norm 30103.2676 (inf) mem 14543MB +[2023-10-11 21:03:51 simmim_pretrain](main_simmim.py 218): INFO Train: [88/200][3000/6787] eta 0:15:50 lr 0.000200 time 0.2517 (0.2510) loss 0.3868 (0.3848) grad_norm 24777.1641 (inf) mem 14543MB +[2023-10-11 21:05:57 simmim_pretrain](main_simmim.py 218): INFO Train: [88/200][3500/6787] eta 0:13:45 lr 0.000200 time 0.2518 (0.2510) loss 0.3731 (0.3819) grad_norm 57457.0273 (inf) mem 14543MB +[2023-10-11 21:08:02 simmim_pretrain](main_simmim.py 218): INFO Train: [88/200][4000/6787] eta 0:11:39 lr 0.000200 time 0.2495 (0.2510) loss 0.3543 (0.3798) grad_norm 33334.3750 (inf) mem 14543MB +[2023-10-11 21:10:08 simmim_pretrain](main_simmim.py 218): INFO Train: [88/200][4500/6787] eta 0:09:34 lr 0.000200 time 0.2524 (0.2510) loss 0.3466 (0.3780) grad_norm 49983.0938 (inf) mem 14543MB +[2023-10-11 21:12:13 simmim_pretrain](main_simmim.py 218): INFO Train: [88/200][5000/6787] eta 0:07:28 lr 0.000200 time 0.2505 (0.2510) loss 0.3361 (0.3767) grad_norm 35029.6875 (inf) mem 14543MB +[2023-10-11 21:14:18 simmim_pretrain](main_simmim.py 218): INFO Train: [88/200][5500/6787] eta 0:05:22 lr 0.000200 time 0.2463 (0.2509) loss 0.3874 (0.3754) grad_norm 62630.1875 (inf) mem 14543MB +[2023-10-11 21:16:23 simmim_pretrain](main_simmim.py 218): INFO Train: [88/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2557 (0.2508) loss 0.3653 (0.3743) grad_norm 85851.2734 (inf) mem 14543MB +[2023-10-11 21:18:28 simmim_pretrain](main_simmim.py 218): INFO Train: [88/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2517 (0.2508) loss 0.3529 (0.3734) grad_norm 68912.6094 (inf) mem 14543MB +[2023-10-11 21:19:40 simmim_pretrain](main_simmim.py 228): INFO EPOCH 88 training takes 0:28:22 +[2023-10-11 21:19:42 simmim_pretrain](main_simmim.py 218): INFO Train: [89/200][0/6787] eta 2:50:25 lr 0.000200 time 1.5067 (1.5067) loss 0.3623 (0.3623) grad_norm 91225.9297 (91225.9297) mem 14543MB +[2023-10-11 21:21:47 simmim_pretrain](main_simmim.py 218): INFO Train: 
[89/200][500/6787] eta 0:26:25 lr 0.000200 time 0.2455 (0.2522) loss 0.3705 (0.3607) grad_norm 68872.0234 (99398.5938) mem 14543MB +[2023-10-11 21:23:52 simmim_pretrain](main_simmim.py 218): INFO Train: [89/200][1000/6787] eta 0:24:14 lr 0.000200 time 0.2544 (0.2513) loss 0.3696 (0.3612) grad_norm 98099.3047 (115776.9297) mem 14543MB +[2023-10-11 21:25:57 simmim_pretrain](main_simmim.py 218): INFO Train: [89/200][1500/6787] eta 0:22:07 lr 0.000200 time 0.2497 (0.2510) loss 0.3465 (0.3612) grad_norm 196062.1094 (136350.5938) mem 14543MB +[2023-10-11 21:28:03 simmim_pretrain](main_simmim.py 218): INFO Train: [89/200][2000/6787] eta 0:20:01 lr 0.000200 time 0.2492 (0.2510) loss 0.3600 (0.3610) grad_norm 161352.8125 (143979.5469) mem 14543MB +[2023-10-11 21:30:08 simmim_pretrain](main_simmim.py 218): INFO Train: [89/200][2500/6787] eta 0:17:55 lr 0.000200 time 0.2591 (0.2510) loss 0.3673 (0.3609) grad_norm 251705.7188 (155916.9219) mem 14543MB +[2023-10-11 21:32:14 simmim_pretrain](main_simmim.py 218): INFO Train: [89/200][3000/6787] eta 0:15:50 lr 0.000200 time 0.2528 (0.2510) loss 0.3481 (0.3606) grad_norm 197844.5938 (175195.7656) mem 14543MB +[2023-10-11 21:34:19 simmim_pretrain](main_simmim.py 218): INFO Train: [89/200][3500/6787] eta 0:13:45 lr 0.000200 time 0.2502 (0.2510) loss 0.3600 (0.3604) grad_norm 647859.2500 (195491.0781) mem 14543MB +[2023-10-11 21:36:25 simmim_pretrain](main_simmim.py 218): INFO Train: [89/200][4000/6787] eta 0:11:39 lr 0.000200 time 0.2488 (0.2511) loss 0.3541 (0.3603) grad_norm 447311.7812 (229069.8125) mem 14543MB +[2023-10-11 21:38:31 simmim_pretrain](main_simmim.py 218): INFO Train: [89/200][4500/6787] eta 0:09:34 lr 0.000200 time 0.2518 (0.2511) loss 0.3644 (0.3602) grad_norm 410143.4375 (inf) mem 14543MB +[2023-10-11 21:40:37 simmim_pretrain](main_simmim.py 218): INFO Train: [89/200][5000/6787] eta 0:07:28 lr 0.000200 time 0.2484 (0.2512) loss 0.3442 (0.3601) grad_norm 408016.3438 (inf) mem 14543MB +[2023-10-11 21:42:41 
simmim_pretrain](main_simmim.py 218): INFO Train: [89/200][5500/6787] eta 0:05:23 lr 0.000200 time 0.2463 (0.2510) loss 0.3622 (0.3600) grad_norm 446225.4688 (inf) mem 14543MB +[2023-10-11 21:44:44 simmim_pretrain](main_simmim.py 218): INFO Train: [89/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2455 (0.2506) loss 0.3564 (0.3599) grad_norm 261182.1094 (inf) mem 14543MB +[2023-10-11 21:46:47 simmim_pretrain](main_simmim.py 218): INFO Train: [89/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2442 (0.2502) loss 0.3668 (0.3598) grad_norm 386898.7188 (inf) mem 14543MB +[2023-10-11 21:47:58 simmim_pretrain](main_simmim.py 228): INFO EPOCH 89 training takes 0:28:17 +[2023-10-11 21:47:59 simmim_pretrain](main_simmim.py 218): INFO Train: [90/200][0/6787] eta 2:22:38 lr 0.000200 time 1.2610 (1.2610) loss 0.3760 (0.3760) grad_norm 170557.5000 (170557.5000) mem 14543MB +[2023-10-11 21:50:01 simmim_pretrain](main_simmim.py 218): INFO Train: [90/200][500/6787] eta 0:25:48 lr 0.000200 time 0.2442 (0.2463) loss 0.3660 (0.3600) grad_norm 168590.0938 (240454.4844) mem 14543MB +[2023-10-11 21:52:03 simmim_pretrain](main_simmim.py 218): INFO Train: [90/200][1000/6787] eta 0:23:38 lr 0.000200 time 0.2438 (0.2452) loss 0.3698 (0.3605) grad_norm 240785.2344 (231088.2344) mem 14543MB +[2023-10-11 21:54:05 simmim_pretrain](main_simmim.py 218): INFO Train: [90/200][1500/6787] eta 0:21:34 lr 0.000200 time 0.2437 (0.2448) loss 0.3605 (0.3607) grad_norm 224320.6719 (225170.1719) mem 14543MB +[2023-10-11 21:56:07 simmim_pretrain](main_simmim.py 218): INFO Train: [90/200][2000/6787] eta 0:19:31 lr 0.000200 time 0.2442 (0.2446) loss 0.3573 (0.3609) grad_norm 482738.2812 (220991.0312) mem 14543MB +[2023-10-11 21:58:09 simmim_pretrain](main_simmim.py 218): INFO Train: [90/200][2500/6787] eta 0:17:28 lr 0.000200 time 0.2442 (0.2445) loss 0.3836 (0.3607) grad_norm 305034.9062 (237426.5625) mem 14543MB +[2023-10-11 22:00:11 simmim_pretrain](main_simmim.py 218): INFO Train: [90/200][3000/6787] eta 
0:15:25 lr 0.000200 time 0.2436 (0.2444) loss 0.3755 (0.3606) grad_norm 305304.8750 (254154.1406) mem 14543MB +[2023-10-11 22:02:13 simmim_pretrain](main_simmim.py 218): INFO Train: [90/200][3500/6787] eta 0:13:23 lr 0.000200 time 0.2437 (0.2444) loss 0.3746 (0.3604) grad_norm 227178.0000 (280693.0938) mem 14543MB +[2023-10-11 22:04:15 simmim_pretrain](main_simmim.py 218): INFO Train: [90/200][4000/6787] eta 0:11:20 lr 0.000200 time 0.2437 (0.2443) loss 0.3537 (0.3604) grad_norm 447759.7500 (287622.4688) mem 14543MB +[2023-10-11 22:06:17 simmim_pretrain](main_simmim.py 218): INFO Train: [90/200][4500/6787] eta 0:09:18 lr 0.000200 time 0.2442 (0.2443) loss 0.3510 (0.3602) grad_norm 285604.6250 (inf) mem 14543MB +[2023-10-11 22:08:19 simmim_pretrain](main_simmim.py 218): INFO Train: [90/200][5000/6787] eta 0:07:16 lr 0.000200 time 0.2443 (0.2443) loss 0.3486 (0.3604) grad_norm 331259.1875 (inf) mem 14543MB +[2023-10-11 22:10:21 simmim_pretrain](main_simmim.py 218): INFO Train: [90/200][5500/6787] eta 0:05:14 lr 0.000200 time 0.2439 (0.2442) loss 0.3699 (0.3604) grad_norm 276170.1250 (inf) mem 14543MB +[2023-10-11 22:12:23 simmim_pretrain](main_simmim.py 218): INFO Train: [90/200][6000/6787] eta 0:03:12 lr 0.000200 time 0.2440 (0.2442) loss 0.3365 (0.3603) grad_norm 278261.8750 (inf) mem 14543MB +[2023-10-11 22:14:25 simmim_pretrain](main_simmim.py 218): INFO Train: [90/200][6500/6787] eta 0:01:10 lr 0.000200 time 0.2441 (0.2442) loss 0.3590 (0.3604) grad_norm 334744.2812 (inf) mem 14543MB +[2023-10-11 22:15:36 simmim_pretrain](main_simmim.py 228): INFO EPOCH 90 training takes 0:27:38 +[2023-10-11 22:15:37 simmim_pretrain](main_simmim.py 218): INFO Train: [91/200][0/6787] eta 2:23:35 lr 0.000200 time 1.2693 (1.2693) loss 0.3623 (0.3623) grad_norm 191644.7969 (191644.7969) mem 14543MB +[2023-10-11 22:17:39 simmim_pretrain](main_simmim.py 218): INFO Train: [91/200][500/6787] eta 0:25:47 lr 0.000200 time 0.2442 (0.2462) loss 0.3624 (0.3594) grad_norm 487173.9375 
(310834.7812) mem 14543MB +[2023-10-11 22:19:41 simmim_pretrain](main_simmim.py 218): INFO Train: [91/200][1000/6787] eta 0:23:38 lr 0.000200 time 0.2443 (0.2451) loss 0.3755 (0.3593) grad_norm 414266.3750 (338198.0625) mem 14543MB +[2023-10-11 22:21:43 simmim_pretrain](main_simmim.py 218): INFO Train: [91/200][1500/6787] eta 0:21:34 lr 0.000200 time 0.2440 (0.2448) loss 0.3474 (0.3596) grad_norm 473233.6562 (340883.9062) mem 14543MB +[2023-10-11 22:23:45 simmim_pretrain](main_simmim.py 218): INFO Train: [91/200][2000/6787] eta 0:19:30 lr 0.000200 time 0.2442 (0.2446) loss 0.3799 (0.3592) grad_norm 416249.3750 (inf) mem 14543MB +[2023-10-11 22:25:47 simmim_pretrain](main_simmim.py 218): INFO Train: [91/200][2500/6787] eta 0:17:28 lr 0.000200 time 0.2443 (0.2445) loss 0.3519 (0.3591) grad_norm 516312.1562 (inf) mem 14543MB +[2023-10-11 22:27:49 simmim_pretrain](main_simmim.py 218): INFO Train: [91/200][3000/6787] eta 0:15:25 lr 0.000200 time 0.2444 (0.2444) loss 0.3723 (0.3593) grad_norm 633295.6875 (inf) mem 14543MB +[2023-10-11 22:29:52 simmim_pretrain](main_simmim.py 218): INFO Train: [91/200][3500/6787] eta 0:13:23 lr 0.000200 time 0.2441 (0.2444) loss 0.3532 (0.3594) grad_norm 445285.3125 (inf) mem 14543MB +[2023-10-11 22:31:54 simmim_pretrain](main_simmim.py 218): INFO Train: [91/200][4000/6787] eta 0:11:20 lr 0.000200 time 0.2438 (0.2443) loss 0.3451 (0.3594) grad_norm 415215.3438 (inf) mem 14543MB +[2023-10-11 22:33:56 simmim_pretrain](main_simmim.py 218): INFO Train: [91/200][4500/6787] eta 0:09:18 lr 0.000200 time 0.2444 (0.2443) loss 0.3935 (0.3597) grad_norm 307076.5625 (inf) mem 14543MB +[2023-10-11 22:35:58 simmim_pretrain](main_simmim.py 218): INFO Train: [91/200][5000/6787] eta 0:07:16 lr 0.000200 time 0.2442 (0.2443) loss 0.3607 (0.3598) grad_norm 282594.4375 (inf) mem 14543MB +[2023-10-11 22:38:00 simmim_pretrain](main_simmim.py 218): INFO Train: [91/200][5500/6787] eta 0:05:14 lr 0.000200 time 0.2437 (0.2442) loss 0.3528 (0.3600) grad_norm 
171084.9219 (inf) mem 14543MB +[2023-10-11 22:40:02 simmim_pretrain](main_simmim.py 218): INFO Train: [91/200][6000/6787] eta 0:03:12 lr 0.000200 time 0.2445 (0.2442) loss 0.3498 (0.3601) grad_norm 258422.8750 (inf) mem 14543MB +[2023-10-11 22:42:04 simmim_pretrain](main_simmim.py 218): INFO Train: [91/200][6500/6787] eta 0:01:10 lr 0.000200 time 0.2439 (0.2442) loss 0.3478 (0.3601) grad_norm 315594.7500 (inf) mem 14543MB +[2023-10-11 22:43:14 simmim_pretrain](main_simmim.py 228): INFO EPOCH 91 training takes 0:27:38 +[2023-10-11 22:43:15 simmim_pretrain](main_simmim.py 218): INFO Train: [92/200][0/6787] eta 2:23:13 lr 0.000200 time 1.2662 (1.2662) loss 0.3728 (0.3728) grad_norm 404795.1562 (404795.1562) mem 14543MB +[2023-10-11 22:45:18 simmim_pretrain](main_simmim.py 218): INFO Train: [92/200][500/6787] eta 0:25:47 lr 0.000200 time 0.2442 (0.2462) loss 0.3567 (0.3595) grad_norm 295807.7500 (349724.4688) mem 14543MB +[2023-10-11 22:47:20 simmim_pretrain](main_simmim.py 218): INFO Train: [92/200][1000/6787] eta 0:23:38 lr 0.000200 time 0.2437 (0.2451) loss 0.3405 (0.3593) grad_norm 251234.4375 (364011.5000) mem 14543MB +[2023-10-11 22:49:22 simmim_pretrain](main_simmim.py 218): INFO Train: [92/200][1500/6787] eta 0:21:33 lr 0.000200 time 0.2441 (0.2447) loss 0.3476 (0.3590) grad_norm 234071.4375 (inf) mem 14543MB +[2023-10-11 22:51:24 simmim_pretrain](main_simmim.py 218): INFO Train: [92/200][2000/6787] eta 0:19:30 lr 0.000200 time 0.2444 (0.2446) loss 0.3648 (0.3591) grad_norm 249882.7656 (inf) mem 14543MB +[2023-10-11 22:53:26 simmim_pretrain](main_simmim.py 218): INFO Train: [92/200][2500/6787] eta 0:17:28 lr 0.000200 time 0.2438 (0.2445) loss 0.3571 (0.3595) grad_norm 260261.1719 (inf) mem 14543MB +[2023-10-11 22:55:28 simmim_pretrain](main_simmim.py 218): INFO Train: [92/200][3000/6787] eta 0:15:25 lr 0.000200 time 0.2444 (0.2444) loss 0.3584 (0.3596) grad_norm 249233.6406 (inf) mem 14543MB +[2023-10-11 22:57:30 simmim_pretrain](main_simmim.py 218): INFO 
Train: [92/200][3500/6787] eta 0:13:23 lr 0.000200 time 0.2437 (0.2443) loss 0.3658 (0.3597) grad_norm 335225.8750 (inf) mem 14543MB +[2023-10-11 22:59:32 simmim_pretrain](main_simmim.py 218): INFO Train: [92/200][4000/6787] eta 0:11:20 lr 0.000200 time 0.2437 (0.2443) loss 0.3524 (0.3596) grad_norm 342623.4375 (inf) mem 14543MB +[2023-10-11 23:01:34 simmim_pretrain](main_simmim.py 218): INFO Train: [92/200][4500/6787] eta 0:09:18 lr 0.000200 time 0.2437 (0.2443) loss 0.3663 (0.3595) grad_norm 245564.4375 (inf) mem 14543MB +[2023-10-11 23:03:36 simmim_pretrain](main_simmim.py 218): INFO Train: [92/200][5000/6787] eta 0:07:16 lr 0.000200 time 0.2437 (0.2442) loss 0.3655 (0.3595) grad_norm 225061.4688 (inf) mem 14543MB +[2023-10-11 23:05:38 simmim_pretrain](main_simmim.py 218): INFO Train: [92/200][5500/6787] eta 0:05:14 lr 0.000200 time 0.2439 (0.2442) loss 0.3683 (0.3598) grad_norm 179358.0625 (inf) mem 14543MB +[2023-10-11 23:07:40 simmim_pretrain](main_simmim.py 218): INFO Train: [92/200][6000/6787] eta 0:03:12 lr 0.000200 time 0.2442 (0.2442) loss 0.3419 (0.3601) grad_norm 158925.0312 (inf) mem 14543MB +[2023-10-11 23:09:42 simmim_pretrain](main_simmim.py 218): INFO Train: [92/200][6500/6787] eta 0:01:10 lr 0.000200 time 0.2441 (0.2442) loss 0.3405 (0.3604) grad_norm 106705.7812 (inf) mem 14543MB +[2023-10-11 23:10:52 simmim_pretrain](main_simmim.py 228): INFO EPOCH 92 training takes 0:27:37 +[2023-10-11 23:10:53 simmim_pretrain](main_simmim.py 218): INFO Train: [93/200][0/6787] eta 2:25:57 lr 0.000200 time 1.2904 (1.2904) loss 0.3629 (0.3629) grad_norm 127502.2109 (127502.2109) mem 14543MB +[2023-10-11 23:12:55 simmim_pretrain](main_simmim.py 218): INFO Train: [93/200][500/6787] eta 0:25:47 lr 0.000200 time 0.2440 (0.2461) loss 0.3568 (0.3615) grad_norm 104398.9219 (139989.8125) mem 14543MB +[2023-10-11 23:14:58 simmim_pretrain](main_simmim.py 218): INFO Train: [93/200][1000/6787] eta 0:23:38 lr 0.000200 time 0.2438 (0.2451) loss 0.3605 (0.3616) grad_norm 
172516.2344 (156020.9219) mem 14543MB +[2023-10-11 23:17:00 simmim_pretrain](main_simmim.py 218): INFO Train: [93/200][1500/6787] eta 0:21:35 lr 0.000200 time 0.2511 (0.2451) loss 0.3330 (0.3615) grad_norm 134507.4219 (169800.7969) mem 14543MB +[2023-10-11 23:19:04 simmim_pretrain](main_simmim.py 218): INFO Train: [93/200][2000/6787] eta 0:19:35 lr 0.000200 time 0.2500 (0.2457) loss 0.3781 (0.3612) grad_norm 489067.6562 (177929.1562) mem 14543MB +[2023-10-11 23:21:09 simmim_pretrain](main_simmim.py 218): INFO Train: [93/200][2500/6787] eta 0:17:36 lr 0.000200 time 0.2508 (0.2465) loss 0.3498 (0.3610) grad_norm 274301.0625 (192479.8750) mem 14543MB +[2023-10-11 23:23:14 simmim_pretrain](main_simmim.py 218): INFO Train: [93/200][3000/6787] eta 0:15:35 lr 0.000200 time 0.2497 (0.2471) loss 0.3493 (0.3605) grad_norm 194772.6875 (209354.0156) mem 14543MB +[2023-10-11 23:25:19 simmim_pretrain](main_simmim.py 218): INFO Train: [93/200][3500/6787] eta 0:13:33 lr 0.000200 time 0.2512 (0.2476) loss 0.3668 (0.3604) grad_norm 350475.1875 (231430.6406) mem 14543MB +[2023-10-11 23:27:25 simmim_pretrain](main_simmim.py 218): INFO Train: [93/200][4000/6787] eta 0:11:31 lr 0.000200 time 0.2496 (0.2480) loss 0.3527 (0.3601) grad_norm 927230.5625 (250409.8125) mem 14543MB +[2023-10-11 23:29:30 simmim_pretrain](main_simmim.py 218): INFO Train: [93/200][4500/6787] eta 0:09:27 lr 0.000200 time 0.2495 (0.2483) loss 0.3624 (0.3599) grad_norm 475389.3438 (inf) mem 14543MB +[2023-10-11 23:31:36 simmim_pretrain](main_simmim.py 218): INFO Train: [93/200][5000/6787] eta 0:07:24 lr 0.000200 time 0.2547 (0.2487) loss 0.3646 (0.3599) grad_norm 502091.1875 (inf) mem 14543MB +[2023-10-11 23:33:42 simmim_pretrain](main_simmim.py 218): INFO Train: [93/200][5500/6787] eta 0:05:20 lr 0.000200 time 0.2567 (0.2491) loss 0.3861 (0.3598) grad_norm 338733.1875 (inf) mem 14543MB +[2023-10-11 23:35:50 simmim_pretrain](main_simmim.py 218): INFO Train: [93/200][6000/6787] eta 0:03:16 lr 0.000200 time 0.2562 
(0.2497) loss 0.3481 (0.3598) grad_norm 450744.2500 (inf) mem 14543MB +[2023-10-11 23:37:59 simmim_pretrain](main_simmim.py 218): INFO Train: [93/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2571 (0.2502) loss 0.3701 (0.3599) grad_norm 252458.0781 (inf) mem 14543MB +[2023-10-11 23:39:13 simmim_pretrain](main_simmim.py 228): INFO EPOCH 93 training takes 0:28:20 +[2023-10-11 23:39:14 simmim_pretrain](main_simmim.py 218): INFO Train: [94/200][0/6787] eta 2:33:10 lr 0.000200 time 1.3542 (1.3542) loss 0.3598 (0.3598) grad_norm 195948.2500 (195948.2500) mem 14543MB +[2023-10-11 23:41:19 simmim_pretrain](main_simmim.py 218): INFO Train: [94/200][500/6787] eta 0:26:30 lr 0.000200 time 0.2515 (0.2530) loss 0.3767 (0.3613) grad_norm 239986.7188 (278015.4688) mem 14543MB +[2023-10-11 23:43:25 simmim_pretrain](main_simmim.py 218): INFO Train: [94/200][1000/6787] eta 0:24:20 lr 0.000200 time 0.2534 (0.2523) loss 0.3650 (0.3616) grad_norm 243425.3594 (254542.3594) mem 14543MB +[2023-10-11 23:45:31 simmim_pretrain](main_simmim.py 218): INFO Train: [94/200][1500/6787] eta 0:22:12 lr 0.000200 time 0.2472 (0.2521) loss 0.3584 (0.3610) grad_norm 338258.1250 (259721.9219) mem 14543MB +[2023-10-11 23:47:37 simmim_pretrain](main_simmim.py 218): INFO Train: [94/200][2000/6787] eta 0:20:05 lr 0.000200 time 0.2514 (0.2519) loss 0.3427 (0.3605) grad_norm 242770.8281 (276487.8438) mem 14543MB +[2023-10-11 23:49:42 simmim_pretrain](main_simmim.py 218): INFO Train: [94/200][2500/6787] eta 0:17:59 lr 0.000200 time 0.2465 (0.2517) loss 0.3694 (0.3604) grad_norm 280177.0000 (inf) mem 14543MB +[2023-10-11 23:51:48 simmim_pretrain](main_simmim.py 218): INFO Train: [94/200][3000/6787] eta 0:15:52 lr 0.000200 time 0.2500 (0.2516) loss 0.3510 (0.3604) grad_norm 322366.2188 (inf) mem 14543MB +[2023-10-11 23:53:53 simmim_pretrain](main_simmim.py 218): INFO Train: [94/200][3500/6787] eta 0:13:46 lr 0.000200 time 0.2501 (0.2515) loss 0.3643 (0.3605) grad_norm 267451.7500 (inf) mem 14543MB +[2023-10-11 
23:55:59 simmim_pretrain](main_simmim.py 218): INFO Train: [94/200][4000/6787] eta 0:11:40 lr 0.000200 time 0.2521 (0.2514) loss 0.3388 (0.3605) grad_norm 349595.1250 (inf) mem 14543MB +[2023-10-11 23:58:04 simmim_pretrain](main_simmim.py 218): INFO Train: [94/200][4500/6787] eta 0:09:34 lr 0.000200 time 0.2538 (0.2514) loss 0.3665 (0.3605) grad_norm 487705.5938 (inf) mem 14543MB +[2023-10-12 00:00:10 simmim_pretrain](main_simmim.py 218): INFO Train: [94/200][5000/6787] eta 0:07:29 lr 0.000200 time 0.2485 (0.2513) loss 0.3733 (0.3604) grad_norm 241284.5156 (inf) mem 14543MB +[2023-10-12 00:02:15 simmim_pretrain](main_simmim.py 218): INFO Train: [94/200][5500/6787] eta 0:05:23 lr 0.000200 time 0.2593 (0.2513) loss 0.3442 (0.3603) grad_norm 681812.5625 (inf) mem 14543MB +[2023-10-12 00:04:21 simmim_pretrain](main_simmim.py 218): INFO Train: [94/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2503 (0.2513) loss 0.3487 (0.3602) grad_norm 261522.1719 (inf) mem 14543MB +[2023-10-12 00:06:26 simmim_pretrain](main_simmim.py 218): INFO Train: [94/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2518 (0.2512) loss 0.3639 (0.3602) grad_norm 314174.4688 (inf) mem 14543MB +[2023-10-12 00:07:38 simmim_pretrain](main_simmim.py 228): INFO EPOCH 94 training takes 0:28:25 +[2023-10-12 00:07:40 simmim_pretrain](main_simmim.py 218): INFO Train: [95/200][0/6787] eta 3:01:26 lr 0.000200 time 1.6040 (1.6040) loss 0.3612 (0.3612) grad_norm 330066.4688 (330066.4688) mem 14543MB +[2023-10-12 00:09:45 simmim_pretrain](main_simmim.py 218): INFO Train: [95/200][500/6787] eta 0:26:30 lr 0.000200 time 0.2519 (0.2530) loss 0.3535 (0.3590) grad_norm 338333.1562 (inf) mem 14543MB +[2023-10-12 00:11:51 simmim_pretrain](main_simmim.py 218): INFO Train: [95/200][1000/6787] eta 0:24:17 lr 0.000200 time 0.2521 (0.2519) loss 0.3642 (0.3598) grad_norm 342719.0938 (inf) mem 14543MB +[2023-10-12 00:13:56 simmim_pretrain](main_simmim.py 218): INFO Train: [95/200][1500/6787] eta 0:22:10 lr 0.000200 time 0.2489 
(0.2516) loss 0.3549 (0.3603) grad_norm 198139.5781 (inf) mem 14543MB +[2023-10-12 00:16:02 simmim_pretrain](main_simmim.py 218): INFO Train: [95/200][2000/6787] eta 0:20:03 lr 0.000200 time 0.2553 (0.2514) loss 0.3507 (0.3603) grad_norm 303251.9062 (inf) mem 14543MB +[2023-10-12 00:18:07 simmim_pretrain](main_simmim.py 218): INFO Train: [95/200][2500/6787] eta 0:17:57 lr 0.000200 time 0.2485 (0.2513) loss 0.3645 (0.3604) grad_norm 269514.3125 (inf) mem 14543MB +[2023-10-12 00:20:12 simmim_pretrain](main_simmim.py 218): INFO Train: [95/200][3000/6787] eta 0:15:51 lr 0.000200 time 0.2492 (0.2512) loss 0.3571 (0.3603) grad_norm 459835.5938 (inf) mem 14543MB +[2023-10-12 00:22:18 simmim_pretrain](main_simmim.py 218): INFO Train: [95/200][3500/6787] eta 0:13:45 lr 0.000200 time 0.2531 (0.2512) loss 0.3537 (0.3601) grad_norm 290855.5625 (inf) mem 14543MB +[2023-10-12 00:24:23 simmim_pretrain](main_simmim.py 218): INFO Train: [95/200][4000/6787] eta 0:11:40 lr 0.000200 time 0.2593 (0.2512) loss 0.3527 (0.3600) grad_norm 662410.2500 (inf) mem 14543MB +[2023-10-12 00:26:29 simmim_pretrain](main_simmim.py 218): INFO Train: [95/200][4500/6787] eta 0:09:34 lr 0.000200 time 0.2458 (0.2512) loss 0.3524 (0.3602) grad_norm 312092.1250 (inf) mem 14543MB +[2023-10-12 00:28:34 simmim_pretrain](main_simmim.py 218): INFO Train: [95/200][5000/6787] eta 0:07:28 lr 0.000200 time 0.2584 (0.2511) loss 0.3704 (0.3602) grad_norm 263137.7812 (inf) mem 14543MB +[2023-10-12 00:30:40 simmim_pretrain](main_simmim.py 218): INFO Train: [95/200][5500/6787] eta 0:05:23 lr 0.000200 time 0.2488 (0.2511) loss 0.3380 (0.3602) grad_norm 217898.2188 (inf) mem 14543MB +[2023-10-12 00:32:45 simmim_pretrain](main_simmim.py 218): INFO Train: [95/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2481 (0.2511) loss 0.3524 (0.3602) grad_norm 187770.5312 (inf) mem 14543MB +[2023-10-12 00:34:51 simmim_pretrain](main_simmim.py 218): INFO Train: [95/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2498 (0.2511) loss 
0.3603 (0.3602) grad_norm 239238.5156 (nan) mem 14543MB +[2023-10-12 00:36:03 simmim_pretrain](main_simmim.py 228): INFO EPOCH 95 training takes 0:28:24 +[2023-10-12 00:36:05 simmim_pretrain](main_simmim.py 218): INFO Train: [96/200][0/6787] eta 2:35:28 lr 0.000200 time 1.3745 (1.3745) loss 0.3515 (0.3515) grad_norm 338666.8125 (338666.8125) mem 14543MB +[2023-10-12 00:38:10 simmim_pretrain](main_simmim.py 218): INFO Train: [96/200][500/6787] eta 0:26:31 lr 0.000200 time 0.2483 (0.2532) loss 0.3654 (0.3613) grad_norm 269618.0312 (263035.4062) mem 14543MB +[2023-10-12 00:40:16 simmim_pretrain](main_simmim.py 218): INFO Train: [96/200][1000/6787] eta 0:24:19 lr 0.000200 time 0.2553 (0.2522) loss 0.3493 (0.3618) grad_norm 159350.4844 (inf) mem 14543MB +[2023-10-12 00:42:21 simmim_pretrain](main_simmim.py 218): INFO Train: [96/200][1500/6787] eta 0:22:11 lr 0.000200 time 0.2522 (0.2519) loss 0.3694 (0.3623) grad_norm 143001.4844 (inf) mem 14543MB +[2023-10-12 00:44:27 simmim_pretrain](main_simmim.py 218): INFO Train: [96/200][2000/6787] eta 0:20:05 lr 0.000200 time 0.2546 (0.2518) loss 0.3682 (0.3624) grad_norm 126866.6172 (inf) mem 14543MB +[2023-10-12 00:46:33 simmim_pretrain](main_simmim.py 218): INFO Train: [96/200][2500/6787] eta 0:17:59 lr 0.000200 time 0.2510 (0.2518) loss 0.3668 (0.3626) grad_norm 86701.1094 (inf) mem 14543MB +[2023-10-12 00:48:39 simmim_pretrain](main_simmim.py 218): INFO Train: [96/200][3000/6787] eta 0:15:53 lr 0.000200 time 0.2472 (0.2518) loss 0.3802 (0.3624) grad_norm 99419.3984 (inf) mem 14543MB +[2023-10-12 00:50:45 simmim_pretrain](main_simmim.py 218): INFO Train: [96/200][3500/6787] eta 0:13:47 lr 0.000200 time 0.2505 (0.2518) loss 0.3649 (0.3621) grad_norm 203786.1250 (inf) mem 14543MB +[2023-10-12 00:52:51 simmim_pretrain](main_simmim.py 218): INFO Train: [96/200][4000/6787] eta 0:11:41 lr 0.000200 time 0.2538 (0.2518) loss 0.3304 (0.3620) grad_norm 193049.5000 (inf) mem 14543MB +[2023-10-12 00:54:57 simmim_pretrain](main_simmim.py 
218): INFO Train: [96/200][4500/6787] eta 0:09:35 lr 0.000200 time 0.2587 (0.2518) loss 0.3694 (0.3618) grad_norm 285247.9688 (inf) mem 14543MB +[2023-10-12 00:57:02 simmim_pretrain](main_simmim.py 218): INFO Train: [96/200][5000/6787] eta 0:07:29 lr 0.000200 time 0.2504 (0.2518) loss 0.3561 (0.3617) grad_norm 334870.4375 (inf) mem 14543MB +[2023-10-12 00:59:08 simmim_pretrain](main_simmim.py 218): INFO Train: [96/200][5500/6787] eta 0:05:23 lr 0.000200 time 0.2472 (0.2517) loss 0.3654 (0.3615) grad_norm 336901.1250 (inf) mem 14543MB +[2023-10-12 01:01:14 simmim_pretrain](main_simmim.py 218): INFO Train: [96/200][6000/6787] eta 0:03:18 lr 0.000200 time 0.2558 (0.2517) loss 0.3631 (0.3614) grad_norm 312311.0938 (inf) mem 14543MB +[2023-10-12 01:03:20 simmim_pretrain](main_simmim.py 218): INFO Train: [96/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2701 (0.2517) loss 0.3649 (0.3612) grad_norm 531862.7500 (inf) mem 14543MB +[2023-10-12 01:04:32 simmim_pretrain](main_simmim.py 228): INFO EPOCH 96 training takes 0:28:28 +[2023-10-12 01:04:34 simmim_pretrain](main_simmim.py 218): INFO Train: [97/200][0/6787] eta 2:53:17 lr 0.000200 time 1.5319 (1.5319) loss 0.3422 (0.3422) grad_norm 364519.9062 (364519.9062) mem 14543MB +[2023-10-12 01:06:39 simmim_pretrain](main_simmim.py 218): INFO Train: [97/200][500/6787] eta 0:26:33 lr 0.000200 time 0.2500 (0.2535) loss 0.3498 (0.3588) grad_norm 413153.1562 (inf) mem 14543MB +[2023-10-12 01:08:45 simmim_pretrain](main_simmim.py 218): INFO Train: [97/200][1000/6787] eta 0:24:20 lr 0.000200 time 0.2476 (0.2524) loss 0.3515 (0.3593) grad_norm 416699.2812 (inf) mem 14543MB +[2023-10-12 01:10:50 simmim_pretrain](main_simmim.py 218): INFO Train: [97/200][1500/6787] eta 0:22:12 lr 0.000200 time 0.2503 (0.2521) loss 0.3595 (0.3601) grad_norm 188335.2656 (inf) mem 14543MB +[2023-10-12 01:12:56 simmim_pretrain](main_simmim.py 218): INFO Train: [97/200][2000/6787] eta 0:20:05 lr 0.000200 time 0.2513 (0.2518) loss 0.3605 (0.3608) grad_norm 
159172.7031 (inf) mem 14543MB +[2023-10-12 01:15:01 simmim_pretrain](main_simmim.py 218): INFO Train: [97/200][2500/6787] eta 0:17:58 lr 0.000200 time 0.2498 (0.2516) loss 0.3568 (0.3612) grad_norm 163809.4844 (inf) mem 14543MB +[2023-10-12 01:17:07 simmim_pretrain](main_simmim.py 218): INFO Train: [97/200][3000/6787] eta 0:15:52 lr 0.000200 time 0.2538 (0.2515) loss 0.3545 (0.3615) grad_norm 129363.6953 (inf) mem 14543MB +[2023-10-12 01:19:12 simmim_pretrain](main_simmim.py 218): INFO Train: [97/200][3500/6787] eta 0:13:46 lr 0.000200 time 0.2490 (0.2514) loss 0.3679 (0.3617) grad_norm 84584.1016 (inf) mem 14543MB +[2023-10-12 01:21:18 simmim_pretrain](main_simmim.py 218): INFO Train: [97/200][4000/6787] eta 0:11:40 lr 0.000200 time 0.2564 (0.2513) loss 0.3545 (0.3617) grad_norm 236204.7031 (inf) mem 14543MB +[2023-10-12 01:23:23 simmim_pretrain](main_simmim.py 218): INFO Train: [97/200][4500/6787] eta 0:09:34 lr 0.000200 time 0.2520 (0.2513) loss 0.3555 (0.3616) grad_norm 91257.7031 (inf) mem 14543MB +[2023-10-12 01:25:28 simmim_pretrain](main_simmim.py 218): INFO Train: [97/200][5000/6787] eta 0:07:28 lr 0.000200 time 0.2459 (0.2512) loss 0.3821 (0.3614) grad_norm 192299.5156 (inf) mem 14543MB +[2023-10-12 01:27:34 simmim_pretrain](main_simmim.py 218): INFO Train: [97/200][5500/6787] eta 0:05:23 lr 0.000200 time 0.2520 (0.2512) loss 0.3765 (0.3613) grad_norm 231435.8125 (inf) mem 14543MB +[2023-10-12 01:29:40 simmim_pretrain](main_simmim.py 218): INFO Train: [97/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2518 (0.2512) loss 0.3583 (0.3612) grad_norm 230838.4219 (inf) mem 14543MB +[2023-10-12 01:31:46 simmim_pretrain](main_simmim.py 218): INFO Train: [97/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2509 (0.2513) loss 0.3828 (0.3611) grad_norm 249152.5312 (inf) mem 14543MB +[2023-10-12 01:32:59 simmim_pretrain](main_simmim.py 228): INFO EPOCH 97 training takes 0:28:26 +[2023-10-12 01:33:00 simmim_pretrain](main_simmim.py 218): INFO Train: [98/200][0/6787] eta 
2:45:34 lr 0.000200 time 1.4638 (1.4638) loss 0.3622 (0.3622) grad_norm 291276.5938 (291276.5938) mem 14543MB +[2023-10-12 01:35:06 simmim_pretrain](main_simmim.py 218): INFO Train: [98/200][500/6787] eta 0:26:32 lr 0.000200 time 0.2512 (0.2534) loss 0.3479 (0.3611) grad_norm 338077.3750 (237961.6562) mem 14543MB +[2023-10-12 01:37:12 simmim_pretrain](main_simmim.py 218): INFO Train: [98/200][1000/6787] eta 0:24:22 lr 0.000200 time 0.2493 (0.2527) loss 0.3441 (0.3609) grad_norm 278081.3750 (243458.1406) mem 14543MB +[2023-10-12 01:39:18 simmim_pretrain](main_simmim.py 218): INFO Train: [98/200][1500/6787] eta 0:22:15 lr 0.000200 time 0.2590 (0.2526) loss 0.3558 (0.3603) grad_norm 653827.8125 (276654.0625) mem 14543MB +[2023-10-12 01:41:24 simmim_pretrain](main_simmim.py 218): INFO Train: [98/200][2000/6787] eta 0:20:08 lr 0.000200 time 0.2488 (0.2525) loss 0.3495 (0.3601) grad_norm 178403.3125 (inf) mem 14543MB +[2023-10-12 01:43:30 simmim_pretrain](main_simmim.py 218): INFO Train: [98/200][2500/6787] eta 0:18:02 lr 0.000200 time 0.2530 (0.2525) loss 0.3604 (0.3601) grad_norm 339051.9375 (inf) mem 14543MB +[2023-10-12 01:45:37 simmim_pretrain](main_simmim.py 218): INFO Train: [98/200][3000/6787] eta 0:15:56 lr 0.000200 time 0.2593 (0.2525) loss 0.3737 (0.3600) grad_norm 275548.2500 (inf) mem 14543MB +[2023-10-12 01:47:43 simmim_pretrain](main_simmim.py 218): INFO Train: [98/200][3500/6787] eta 0:13:50 lr 0.000200 time 0.2503 (0.2526) loss 0.3631 (0.3600) grad_norm 264094.0625 (inf) mem 14543MB +[2023-10-12 01:49:49 simmim_pretrain](main_simmim.py 218): INFO Train: [98/200][4000/6787] eta 0:11:43 lr 0.000200 time 0.2482 (0.2525) loss 0.3779 (0.3600) grad_norm 299249.4062 (inf) mem 14543MB +[2023-10-12 01:51:55 simmim_pretrain](main_simmim.py 218): INFO Train: [98/200][4500/6787] eta 0:09:37 lr 0.000200 time 0.2539 (0.2525) loss 0.3538 (0.3602) grad_norm 153941.1875 (nan) mem 14543MB +[2023-10-12 01:54:03 simmim_pretrain](main_simmim.py 218): INFO Train: 
[98/200][5000/6787] eta 0:07:31 lr 0.000200 time 0.2573 (0.2529) loss 0.3554 (0.3605) grad_norm 181709.2031 (nan) mem 14543MB +[2023-10-12 01:56:12 simmim_pretrain](main_simmim.py 218): INFO Train: [98/200][5500/6787] eta 0:05:25 lr 0.000200 time 0.2581 (0.2533) loss 0.3563 (0.3608) grad_norm 181319.8750 (nan) mem 14543MB +[2023-10-12 01:58:21 simmim_pretrain](main_simmim.py 218): INFO Train: [98/200][6000/6787] eta 0:03:19 lr 0.000200 time 0.2573 (0.2536) loss 0.3728 (0.3612) grad_norm 75110.2344 (nan) mem 14543MB +[2023-10-12 02:00:30 simmim_pretrain](main_simmim.py 218): INFO Train: [98/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2578 (0.2539) loss 0.3759 (0.3617) grad_norm 73465.4375 (nan) mem 14543MB +[2023-10-12 02:01:44 simmim_pretrain](main_simmim.py 228): INFO EPOCH 98 training takes 0:28:45 +[2023-10-12 02:01:45 simmim_pretrain](main_simmim.py 218): INFO Train: [99/200][0/6787] eta 2:55:50 lr 0.000200 time 1.5545 (1.5545) loss 0.3537 (0.3537) grad_norm 65805.4844 (65805.4844) mem 14543MB +[2023-10-12 02:03:51 simmim_pretrain](main_simmim.py 218): INFO Train: [99/200][500/6787] eta 0:26:31 lr 0.000200 time 0.2534 (0.2531) loss 0.3779 (0.3640) grad_norm 57297.2031 (64116.5977) mem 14543MB +[2023-10-12 02:05:57 simmim_pretrain](main_simmim.py 218): INFO Train: [99/200][1000/6787] eta 0:24:20 lr 0.000200 time 0.2533 (0.2524) loss 0.3790 (0.3651) grad_norm 70941.5391 (64241.7031) mem 14543MB +[2023-10-12 02:08:02 simmim_pretrain](main_simmim.py 218): INFO Train: [99/200][1500/6787] eta 0:22:13 lr 0.000200 time 0.2526 (0.2522) loss 0.3404 (0.3641) grad_norm 89075.2422 (69660.6641) mem 14543MB +[2023-10-12 02:10:08 simmim_pretrain](main_simmim.py 218): INFO Train: [99/200][2000/6787] eta 0:20:06 lr 0.000200 time 0.2496 (0.2521) loss 0.3739 (0.3636) grad_norm 75972.2344 (76073.1953) mem 14543MB +[2023-10-12 02:12:14 simmim_pretrain](main_simmim.py 218): INFO Train: [99/200][2500/6787] eta 0:18:00 lr 0.000200 time 0.2523 (0.2520) loss 0.3731 (0.3634) grad_norm 
62650.6758 (81671.0547) mem 14543MB +[2023-10-12 02:14:20 simmim_pretrain](main_simmim.py 218): INFO Train: [99/200][3000/6787] eta 0:15:54 lr 0.000200 time 0.2512 (0.2520) loss 0.3580 (0.3632) grad_norm 137453.0781 (85854.5703) mem 14543MB +[2023-10-12 02:16:26 simmim_pretrain](main_simmim.py 218): INFO Train: [99/200][3500/6787] eta 0:13:48 lr 0.000200 time 0.2500 (0.2520) loss 0.3592 (0.3628) grad_norm 276063.9062 (93401.7188) mem 14543MB +[2023-10-12 02:18:32 simmim_pretrain](main_simmim.py 218): INFO Train: [99/200][4000/6787] eta 0:11:42 lr 0.000200 time 0.2504 (0.2520) loss 0.3813 (0.3625) grad_norm 192551.4375 (103270.8281) mem 14543MB +[2023-10-12 02:20:38 simmim_pretrain](main_simmim.py 218): INFO Train: [99/200][4500/6787] eta 0:09:36 lr 0.000200 time 0.2540 (0.2520) loss 0.3612 (0.3623) grad_norm 157966.5938 (inf) mem 14543MB +[2023-10-12 02:22:44 simmim_pretrain](main_simmim.py 218): INFO Train: [99/200][5000/6787] eta 0:07:30 lr 0.000200 time 0.2591 (0.2520) loss 0.3608 (0.3623) grad_norm 126726.2969 (inf) mem 14543MB +[2023-10-12 02:24:50 simmim_pretrain](main_simmim.py 218): INFO Train: [99/200][5500/6787] eta 0:05:24 lr 0.000200 time 0.2490 (0.2520) loss 0.3756 (0.3622) grad_norm 124848.4609 (inf) mem 14543MB +[2023-10-12 02:26:57 simmim_pretrain](main_simmim.py 218): INFO Train: [99/200][6000/6787] eta 0:03:18 lr 0.000200 time 0.2599 (0.2521) loss 0.3770 (0.3623) grad_norm 144055.2500 (inf) mem 14543MB +[2023-10-12 02:29:03 simmim_pretrain](main_simmim.py 218): INFO Train: [99/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2597 (0.2521) loss 0.3378 (0.3622) grad_norm 181633.9844 (inf) mem 14543MB +[2023-10-12 02:30:16 simmim_pretrain](main_simmim.py 228): INFO EPOCH 99 training takes 0:28:31 +[2023-10-12 02:30:17 simmim_pretrain](main_simmim.py 218): INFO Train: [100/200][0/6787] eta 2:41:47 lr 0.000200 time 1.4303 (1.4303) loss 0.3415 (0.3415) grad_norm 178395.6406 (178395.6406) mem 14543MB +[2023-10-12 02:32:23 simmim_pretrain](main_simmim.py 
218): INFO Train: [100/200][500/6787] eta 0:26:32 lr 0.000200 time 0.2483 (0.2533) loss 0.3766 (0.3603) grad_norm 277789.6875 (171691.6094) mem 14543MB +[2023-10-12 02:34:28 simmim_pretrain](main_simmim.py 218): INFO Train: [100/200][1000/6787] eta 0:24:20 lr 0.000200 time 0.2527 (0.2524) loss 0.3692 (0.3605) grad_norm 183919.0781 (181329.0156) mem 14543MB +[2023-10-12 02:36:34 simmim_pretrain](main_simmim.py 218): INFO Train: [100/200][1500/6787] eta 0:22:12 lr 0.000200 time 0.2463 (0.2520) loss 0.3559 (0.3600) grad_norm 154250.5781 (208835.6094) mem 14543MB +[2023-10-12 02:38:39 simmim_pretrain](main_simmim.py 218): INFO Train: [100/200][2000/6787] eta 0:20:05 lr 0.000200 time 0.2529 (0.2517) loss 0.3514 (0.3597) grad_norm 241654.4219 (221784.2344) mem 14543MB +[2023-10-12 02:40:45 simmim_pretrain](main_simmim.py 218): INFO Train: [100/200][2500/6787] eta 0:17:58 lr 0.000200 time 0.2483 (0.2516) loss 0.3692 (0.3592) grad_norm 243274.0000 (243056.1094) mem 14543MB +[2023-10-12 02:42:50 simmim_pretrain](main_simmim.py 218): INFO Train: [100/200][3000/6787] eta 0:15:52 lr 0.000200 time 0.2517 (0.2514) loss 0.3677 (0.3593) grad_norm 377185.2188 (255469.4688) mem 14543MB +[2023-10-12 02:44:56 simmim_pretrain](main_simmim.py 218): INFO Train: [100/200][3500/6787] eta 0:13:46 lr 0.000200 time 0.2503 (0.2514) loss 0.3582 (0.3592) grad_norm 296182.0938 (inf) mem 14543MB +[2023-10-12 02:47:02 simmim_pretrain](main_simmim.py 218): INFO Train: [100/200][4000/6787] eta 0:11:40 lr 0.000200 time 0.2520 (0.2514) loss 0.3682 (0.3592) grad_norm 281465.1562 (inf) mem 14543MB +[2023-10-12 02:49:07 simmim_pretrain](main_simmim.py 218): INFO Train: [100/200][4500/6787] eta 0:09:34 lr 0.000200 time 0.2464 (0.2514) loss 0.3634 (0.3592) grad_norm 170077.6562 (inf) mem 14543MB +[2023-10-12 02:51:13 simmim_pretrain](main_simmim.py 218): INFO Train: [100/200][5000/6787] eta 0:07:29 lr 0.000200 time 0.2512 (0.2514) loss 0.3968 (0.3593) grad_norm 97890.8047 (inf) mem 14543MB +[2023-10-12 
02:53:19 simmim_pretrain](main_simmim.py 218): INFO Train: [100/200][5500/6787] eta 0:05:23 lr 0.000200 time 0.2502 (0.2514) loss 0.3701 (0.3593) grad_norm 229536.2969 (inf) mem 14543MB +[2023-10-12 02:55:25 simmim_pretrain](main_simmim.py 218): INFO Train: [100/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2537 (0.2514) loss 0.3568 (0.3593) grad_norm 325122.0625 (inf) mem 14543MB +[2023-10-12 02:57:33 simmim_pretrain](main_simmim.py 218): INFO Train: [100/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2569 (0.2518) loss 0.3482 (0.3593) grad_norm 341409.8438 (inf) mem 14543MB +[2023-10-12 02:58:47 simmim_pretrain](main_simmim.py 228): INFO EPOCH 100 training takes 0:28:31 +[2023-10-12 02:58:47 simmim_pretrain](utils.py 62): INFO /root/autodl-tmp/LSQ-simmim/checkpoint/simmim_pretrain/vit_simmim/ckpt_epoch_100.pth saving...... +[2023-10-12 02:58:48 simmim_pretrain](utils.py 64): INFO /root/autodl-tmp/LSQ-simmim/checkpoint/simmim_pretrain/vit_simmim/ckpt_epoch_100.pth saved !!! +[2023-10-12 02:58:49 simmim_pretrain](main_simmim.py 218): INFO Train: [101/200][0/6787] eta 2:30:44 lr 0.000200 time 1.3327 (1.3327) loss 0.3659 (0.3659) grad_norm 277496.2500 (277496.2500) mem 14543MB +[2023-10-12 03:00:55 simmim_pretrain](main_simmim.py 218): INFO Train: [101/200][500/6787] eta 0:26:33 lr 0.000200 time 0.2546 (0.2535) loss 0.3800 (0.3581) grad_norm 271890.7812 (374255.9688) mem 14543MB +[2023-10-12 03:03:01 simmim_pretrain](main_simmim.py 218): INFO Train: [101/200][1000/6787] eta 0:24:22 lr 0.000200 time 0.2483 (0.2527) loss 0.3794 (0.3588) grad_norm 330771.1562 (inf) mem 14543MB +[2023-10-12 03:05:08 simmim_pretrain](main_simmim.py 218): INFO Train: [101/200][1500/6787] eta 0:22:18 lr 0.000200 time 0.2548 (0.2531) loss 0.3560 (0.3586) grad_norm 174398.2344 (inf) mem 14543MB +[2023-10-12 03:07:15 simmim_pretrain](main_simmim.py 218): INFO Train: [101/200][2000/6787] eta 0:20:12 lr 0.000200 time 0.2535 (0.2533) loss 0.3580 (0.3588) grad_norm 265289.6562 (inf) mem 14543MB 
+[2023-10-12 03:09:22 simmim_pretrain](main_simmim.py 218): INFO Train: [101/200][2500/6787] eta 0:18:06 lr 0.000200 time 0.2548 (0.2534) loss 0.3587 (0.3592) grad_norm 203074.8594 (inf) mem 14543MB +[2023-10-12 03:11:29 simmim_pretrain](main_simmim.py 218): INFO Train: [101/200][3000/6787] eta 0:16:00 lr 0.000200 time 0.2545 (0.2535) loss 0.3532 (0.3594) grad_norm 335170.5625 (inf) mem 14543MB +[2023-10-12 03:13:35 simmim_pretrain](main_simmim.py 218): INFO Train: [101/200][3500/6787] eta 0:13:53 lr 0.000200 time 0.2531 (0.2535) loss 0.3747 (0.3595) grad_norm 210431.0156 (inf) mem 14543MB +[2023-10-12 03:15:43 simmim_pretrain](main_simmim.py 218): INFO Train: [101/200][4000/6787] eta 0:11:46 lr 0.000200 time 0.2533 (0.2536) loss 0.3657 (0.3595) grad_norm 286953.2188 (inf) mem 14543MB +[2023-10-12 03:17:49 simmim_pretrain](main_simmim.py 218): INFO Train: [101/200][4500/6787] eta 0:09:40 lr 0.000200 time 0.2526 (0.2536) loss 0.3536 (0.3594) grad_norm 355317.6562 (inf) mem 14543MB +[2023-10-12 03:19:56 simmim_pretrain](main_simmim.py 218): INFO Train: [101/200][5000/6787] eta 0:07:33 lr 0.000200 time 0.2541 (0.2537) loss 0.3372 (0.3593) grad_norm 365518.5312 (inf) mem 14543MB +[2023-10-12 03:22:03 simmim_pretrain](main_simmim.py 218): INFO Train: [101/200][5500/6787] eta 0:05:26 lr 0.000200 time 0.2524 (0.2537) loss 0.3708 (0.3595) grad_norm 208737.4062 (inf) mem 14543MB +[2023-10-12 03:24:10 simmim_pretrain](main_simmim.py 218): INFO Train: [101/200][6000/6787] eta 0:03:19 lr 0.000200 time 0.2541 (0.2537) loss 0.3531 (0.3595) grad_norm 82415.5078 (inf) mem 14543MB +[2023-10-12 03:26:17 simmim_pretrain](main_simmim.py 218): INFO Train: [101/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2535 (0.2537) loss 0.3573 (0.3596) grad_norm 162724.8438 (inf) mem 14543MB +[2023-10-12 03:27:30 simmim_pretrain](main_simmim.py 228): INFO EPOCH 101 training takes 0:28:42 +[2023-10-12 03:27:32 simmim_pretrain](main_simmim.py 218): INFO Train: [102/200][0/6787] eta 2:47:31 lr 
0.000200 time 1.4810 (1.4810) loss 0.3605 (0.3605) grad_norm 239233.9844 (239233.9844) mem 14543MB +[2023-10-12 03:29:37 simmim_pretrain](main_simmim.py 218): INFO Train: [102/200][500/6787] eta 0:26:31 lr 0.000200 time 0.2485 (0.2531) loss 0.3649 (0.3589) grad_norm 235283.1406 (283245.0938) mem 14543MB +[2023-10-12 03:31:42 simmim_pretrain](main_simmim.py 218): INFO Train: [102/200][1000/6787] eta 0:24:17 lr 0.000200 time 0.2515 (0.2519) loss 0.3581 (0.3584) grad_norm 713913.6250 (330579.7812) mem 14543MB +[2023-10-12 03:33:48 simmim_pretrain](main_simmim.py 218): INFO Train: [102/200][1500/6787] eta 0:22:10 lr 0.000200 time 0.2515 (0.2516) loss 0.3636 (0.3587) grad_norm 139662.3594 (inf) mem 14543MB +[2023-10-12 03:35:54 simmim_pretrain](main_simmim.py 218): INFO Train: [102/200][2000/6787] eta 0:20:04 lr 0.000200 time 0.2500 (0.2515) loss 0.3528 (0.3598) grad_norm 173560.1562 (inf) mem 14543MB +[2023-10-12 03:37:59 simmim_pretrain](main_simmim.py 218): INFO Train: [102/200][2500/6787] eta 0:17:57 lr 0.000200 time 0.2527 (0.2515) loss 0.3590 (0.3602) grad_norm 130914.5703 (inf) mem 14543MB +[2023-10-12 03:40:05 simmim_pretrain](main_simmim.py 218): INFO Train: [102/200][3000/6787] eta 0:15:52 lr 0.000200 time 0.2551 (0.2514) loss 0.3533 (0.3607) grad_norm 103083.4766 (inf) mem 14543MB +[2023-10-12 03:42:12 simmim_pretrain](main_simmim.py 218): INFO Train: [102/200][3500/6787] eta 0:13:47 lr 0.000200 time 0.2571 (0.2517) loss 0.3387 (0.3608) grad_norm 133232.7812 (inf) mem 14543MB +[2023-10-12 03:44:20 simmim_pretrain](main_simmim.py 218): INFO Train: [102/200][4000/6787] eta 0:11:43 lr 0.000200 time 0.2510 (0.2523) loss 0.3599 (0.3608) grad_norm 233490.5938 (inf) mem 14543MB +[2023-10-12 03:46:28 simmim_pretrain](main_simmim.py 218): INFO Train: [102/200][4500/6787] eta 0:09:37 lr 0.000200 time 0.2579 (0.2527) loss 0.3705 (0.3607) grad_norm 205447.2500 (inf) mem 14543MB +[2023-10-12 03:48:36 simmim_pretrain](main_simmim.py 218): INFO Train: [102/200][5000/6787] 
eta 0:07:32 lr 0.000200 time 0.2500 (0.2530) loss 0.3592 (0.3606) grad_norm 286590.4375 (inf) mem 14543MB +[2023-10-12 03:50:44 simmim_pretrain](main_simmim.py 218): INFO Train: [102/200][5500/6787] eta 0:05:25 lr 0.000200 time 0.2504 (0.2533) loss 0.3661 (0.3605) grad_norm 182781.5938 (inf) mem 14543MB +[2023-10-12 03:52:52 simmim_pretrain](main_simmim.py 218): INFO Train: [102/200][6000/6787] eta 0:03:19 lr 0.000200 time 0.2547 (0.2535) loss 0.3656 (0.3603) grad_norm 478015.0312 (inf) mem 14543MB +[2023-10-12 03:55:00 simmim_pretrain](main_simmim.py 218): INFO Train: [102/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2554 (0.2537) loss 0.3759 (0.3602) grad_norm 283926.3750 (inf) mem 14543MB +[2023-10-12 03:56:14 simmim_pretrain](main_simmim.py 228): INFO EPOCH 102 training takes 0:28:43 +[2023-10-12 03:56:15 simmim_pretrain](main_simmim.py 218): INFO Train: [103/200][0/6787] eta 2:45:59 lr 0.000200 time 1.4675 (1.4675) loss 0.3634 (0.3634) grad_norm 388951.9062 (388951.9062) mem 14543MB +[2023-10-12 03:58:20 simmim_pretrain](main_simmim.py 218): INFO Train: [103/200][500/6787] eta 0:26:32 lr 0.000200 time 0.2503 (0.2532) loss 0.3666 (0.3591) grad_norm 154496.8594 (246824.7188) mem 14543MB +[2023-10-12 04:00:26 simmim_pretrain](main_simmim.py 218): INFO Train: [103/200][1000/6787] eta 0:24:19 lr 0.000200 time 0.2494 (0.2521) loss 0.3690 (0.3599) grad_norm 312323.1875 (240409.6250) mem 14543MB +[2023-10-12 04:02:32 simmim_pretrain](main_simmim.py 218): INFO Train: [103/200][1500/6787] eta 0:22:11 lr 0.000200 time 0.2484 (0.2518) loss 0.3775 (0.3602) grad_norm 263561.6250 (236388.0312) mem 14543MB +[2023-10-12 04:04:37 simmim_pretrain](main_simmim.py 218): INFO Train: [103/200][2000/6787] eta 0:20:04 lr 0.000200 time 0.2511 (0.2517) loss 0.3555 (0.3601) grad_norm 306334.1875 (261472.4531) mem 14543MB +[2023-10-12 04:06:43 simmim_pretrain](main_simmim.py 218): INFO Train: [103/200][2500/6787] eta 0:17:58 lr 0.000200 time 0.2511 (0.2517) loss 0.3403 (0.3596) 
grad_norm 410641.6875 (280779.8125) mem 14543MB +[2023-10-12 04:08:49 simmim_pretrain](main_simmim.py 218): INFO Train: [103/200][3000/6787] eta 0:15:52 lr 0.000200 time 0.2567 (0.2516) loss 0.3648 (0.3594) grad_norm 538252.2500 (299305.2188) mem 14543MB +[2023-10-12 04:10:54 simmim_pretrain](main_simmim.py 218): INFO Train: [103/200][3500/6787] eta 0:13:46 lr 0.000200 time 0.2521 (0.2515) loss 0.3670 (0.3592) grad_norm 327268.6875 (312771.9062) mem 14543MB +[2023-10-12 04:13:00 simmim_pretrain](main_simmim.py 218): INFO Train: [103/200][4000/6787] eta 0:11:40 lr 0.000200 time 0.2463 (0.2515) loss 0.3620 (0.3591) grad_norm 502099.7500 (inf) mem 14543MB +[2023-10-12 04:15:05 simmim_pretrain](main_simmim.py 218): INFO Train: [103/200][4500/6787] eta 0:09:34 lr 0.000200 time 0.2498 (0.2514) loss 0.3493 (0.3592) grad_norm 372024.2812 (inf) mem 14543MB +[2023-10-12 04:17:11 simmim_pretrain](main_simmim.py 218): INFO Train: [103/200][5000/6787] eta 0:07:29 lr 0.000200 time 0.2508 (0.2513) loss 0.3617 (0.3590) grad_norm 334053.9688 (inf) mem 14543MB +[2023-10-12 04:19:16 simmim_pretrain](main_simmim.py 218): INFO Train: [103/200][5500/6787] eta 0:05:23 lr 0.000200 time 0.2471 (0.2513) loss 0.3563 (0.3589) grad_norm 511195.5938 (inf) mem 14543MB +[2023-10-12 04:21:21 simmim_pretrain](main_simmim.py 218): INFO Train: [103/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2466 (0.2513) loss 0.3725 (0.3589) grad_norm 328542.8750 (inf) mem 14543MB +[2023-10-12 04:23:27 simmim_pretrain](main_simmim.py 218): INFO Train: [103/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2521 (0.2513) loss 0.3769 (0.3589) grad_norm 317993.0938 (inf) mem 14543MB +[2023-10-12 04:24:40 simmim_pretrain](main_simmim.py 228): INFO EPOCH 103 training takes 0:28:26 +[2023-10-12 04:24:41 simmim_pretrain](main_simmim.py 218): INFO Train: [104/200][0/6787] eta 2:45:45 lr 0.000200 time 1.4653 (1.4653) loss 0.3562 (0.3562) grad_norm 331690.6875 (331690.6875) mem 14543MB +[2023-10-12 04:26:46 
simmim_pretrain](main_simmim.py 218): INFO Train: [104/200][500/6787] eta 0:26:29 lr 0.000200 time 0.2495 (0.2528) loss 0.3557 (0.3609) grad_norm 312228.6562 (253819.3906) mem 14543MB +[2023-10-12 04:28:52 simmim_pretrain](main_simmim.py 218): INFO Train: [104/200][1000/6787] eta 0:24:17 lr 0.000200 time 0.2522 (0.2518) loss 0.3658 (0.3600) grad_norm 129873.7891 (247006.0625) mem 14543MB +[2023-10-12 04:30:57 simmim_pretrain](main_simmim.py 218): INFO Train: [104/200][1500/6787] eta 0:22:10 lr 0.000200 time 0.2526 (0.2516) loss 0.3589 (0.3598) grad_norm 264976.5000 (inf) mem 14543MB +[2023-10-12 04:33:03 simmim_pretrain](main_simmim.py 218): INFO Train: [104/200][2000/6787] eta 0:20:03 lr 0.000200 time 0.2498 (0.2514) loss 0.3463 (0.3599) grad_norm 259917.6719 (inf) mem 14543MB +[2023-10-12 04:35:08 simmim_pretrain](main_simmim.py 218): INFO Train: [104/200][2500/6787] eta 0:17:57 lr 0.000200 time 0.2497 (0.2514) loss 0.3566 (0.3600) grad_norm 278851.9375 (inf) mem 14543MB +[2023-10-12 04:37:14 simmim_pretrain](main_simmim.py 218): INFO Train: [104/200][3000/6787] eta 0:15:52 lr 0.000200 time 0.2457 (0.2515) loss 0.3486 (0.3601) grad_norm 412786.8125 (inf) mem 14543MB +[2023-10-12 04:39:20 simmim_pretrain](main_simmim.py 218): INFO Train: [104/200][3500/6787] eta 0:13:46 lr 0.000200 time 0.2509 (0.2515) loss 0.3594 (0.3602) grad_norm 536714.4375 (inf) mem 14543MB +[2023-10-12 04:41:26 simmim_pretrain](main_simmim.py 218): INFO Train: [104/200][4000/6787] eta 0:11:40 lr 0.000200 time 0.2519 (0.2515) loss 0.3548 (0.3600) grad_norm 200400.7500 (inf) mem 14543MB +[2023-10-12 04:43:32 simmim_pretrain](main_simmim.py 218): INFO Train: [104/200][4500/6787] eta 0:09:35 lr 0.000200 time 0.2588 (0.2515) loss 0.3731 (0.3600) grad_norm 433506.9062 (inf) mem 14543MB +[2023-10-12 04:45:37 simmim_pretrain](main_simmim.py 218): INFO Train: [104/200][5000/6787] eta 0:07:29 lr 0.000200 time 0.2452 (0.2514) loss 0.3589 (0.3597) grad_norm 257062.3594 (inf) mem 14543MB +[2023-10-12 
04:47:42 simmim_pretrain](main_simmim.py 218): INFO Train: [104/200][5500/6787] eta 0:05:23 lr 0.000200 time 0.2592 (0.2513) loss 0.3514 (0.3596) grad_norm 586824.7500 (inf) mem 14543MB +[2023-10-12 04:49:47 simmim_pretrain](main_simmim.py 218): INFO Train: [104/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2457 (0.2513) loss 0.3551 (0.3596) grad_norm 272321.3750 (inf) mem 14543MB +[2023-10-12 04:51:53 simmim_pretrain](main_simmim.py 218): INFO Train: [104/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2519 (0.2512) loss 0.3628 (0.3596) grad_norm 282600.7188 (inf) mem 14543MB +[2023-10-12 04:53:05 simmim_pretrain](main_simmim.py 228): INFO EPOCH 104 training takes 0:28:25 +[2023-10-12 04:53:06 simmim_pretrain](main_simmim.py 218): INFO Train: [105/200][0/6787] eta 2:54:23 lr 0.000200 time 1.5417 (1.5417) loss 0.5384 (0.5384) grad_norm 2427.8015 (2427.8015) mem 14543MB +[2023-10-12 04:55:11 simmim_pretrain](main_simmim.py 218): INFO Train: [105/200][500/6787] eta 0:26:23 lr 0.000200 time 0.2588 (0.2518) loss 0.4678 (0.5064) grad_norm 18258.2773 (14356.2832) mem 14543MB +[2023-10-12 04:57:16 simmim_pretrain](main_simmim.py 218): INFO Train: [105/200][1000/6787] eta 0:24:11 lr 0.000200 time 0.2500 (0.2509) loss 0.4496 (0.4927) grad_norm 28690.3828 (19231.4727) mem 14543MB +[2023-10-12 04:59:21 simmim_pretrain](main_simmim.py 218): INFO Train: [105/200][1500/6787] eta 0:22:05 lr 0.000200 time 0.2470 (0.2506) loss 0.4636 (0.4832) grad_norm 27178.0293 (21687.2891) mem 14543MB +[2023-10-12 05:01:26 simmim_pretrain](main_simmim.py 218): INFO Train: [105/200][2000/6787] eta 0:19:59 lr 0.000200 time 0.2515 (0.2506) loss 0.3869 (0.4685) grad_norm 17513.5566 (22290.4648) mem 14543MB +[2023-10-12 05:03:31 simmim_pretrain](main_simmim.py 218): INFO Train: [105/200][2500/6787] eta 0:17:54 lr 0.000200 time 0.2479 (0.2505) loss 0.3787 (0.4520) grad_norm 40133.2852 (23639.5371) mem 14543MB +[2023-10-12 05:05:37 simmim_pretrain](main_simmim.py 218): INFO Train: [105/200][3000/6787] 
eta 0:15:49 lr 0.000200 time 0.2528 (0.2506) loss 0.3680 (0.4389) grad_norm 22285.3926 (24396.3906) mem 14543MB +[2023-10-12 05:07:42 simmim_pretrain](main_simmim.py 218): INFO Train: [105/200][3500/6787] eta 0:13:43 lr 0.000200 time 0.2498 (0.2506) loss 0.3663 (0.4291) grad_norm 34334.0352 (24714.7715) mem 14543MB +[2023-10-12 05:09:48 simmim_pretrain](main_simmim.py 218): INFO Train: [105/200][4000/6787] eta 0:11:38 lr 0.000200 time 0.2502 (0.2507) loss 0.3597 (0.4215) grad_norm 32990.3320 (25432.3164) mem 14543MB +[2023-10-12 05:11:53 simmim_pretrain](main_simmim.py 218): INFO Train: [105/200][4500/6787] eta 0:09:33 lr 0.000200 time 0.2489 (0.2507) loss 0.3578 (0.4151) grad_norm 43911.1484 (26786.4238) mem 14543MB +[2023-10-12 05:13:59 simmim_pretrain](main_simmim.py 218): INFO Train: [105/200][5000/6787] eta 0:07:28 lr 0.000200 time 0.2501 (0.2508) loss 0.3374 (0.4100) grad_norm 35386.1406 (28362.8418) mem 14543MB +[2023-10-12 05:16:04 simmim_pretrain](main_simmim.py 218): INFO Train: [105/200][5500/6787] eta 0:05:22 lr 0.000200 time 0.2518 (0.2508) loss 0.3786 (0.4059) grad_norm 35993.1016 (29983.7969) mem 14543MB +[2023-10-12 05:18:10 simmim_pretrain](main_simmim.py 218): INFO Train: [105/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2540 (0.2508) loss 0.3670 (0.4023) grad_norm 40629.6562 (31884.3008) mem 14543MB +[2023-10-12 05:20:16 simmim_pretrain](main_simmim.py 218): INFO Train: [105/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2541 (0.2509) loss 0.3624 (0.3993) grad_norm 142373.7188 (35564.3711) mem 14543MB +[2023-10-12 05:21:28 simmim_pretrain](main_simmim.py 228): INFO EPOCH 105 training takes 0:28:23 +[2023-10-12 05:21:29 simmim_pretrain](main_simmim.py 218): INFO Train: [106/200][0/6787] eta 2:34:57 lr 0.000200 time 1.3699 (1.3699) loss 0.3637 (0.3637) grad_norm 107006.6250 (107006.6250) mem 14543MB +[2023-10-12 05:23:35 simmim_pretrain](main_simmim.py 218): INFO Train: [106/200][500/6787] eta 0:26:30 lr 0.000200 time 0.2496 (0.2529) loss 0.3541 
(0.3615) grad_norm 70954.7500 (79706.2109) mem 14543MB +[2023-10-12 05:25:40 simmim_pretrain](main_simmim.py 218): INFO Train: [106/200][1000/6787] eta 0:24:17 lr 0.000200 time 0.2501 (0.2519) loss 0.3731 (0.3618) grad_norm 82121.7578 (81669.6719) mem 14543MB +[2023-10-12 05:27:45 simmim_pretrain](main_simmim.py 218): INFO Train: [106/200][1500/6787] eta 0:22:09 lr 0.000200 time 0.2522 (0.2514) loss 0.3672 (0.3614) grad_norm 98132.1172 (92123.9922) mem 14543MB +[2023-10-12 05:29:51 simmim_pretrain](main_simmim.py 218): INFO Train: [106/200][2000/6787] eta 0:20:02 lr 0.000200 time 0.2461 (0.2512) loss 0.3619 (0.3611) grad_norm 108209.9219 (103156.9297) mem 14543MB +[2023-10-12 05:31:56 simmim_pretrain](main_simmim.py 218): INFO Train: [106/200][2500/6787] eta 0:17:55 lr 0.000200 time 0.2487 (0.2510) loss 0.3692 (0.3606) grad_norm 156429.3594 (120361.1094) mem 14543MB +[2023-10-12 05:34:01 simmim_pretrain](main_simmim.py 218): INFO Train: [106/200][3000/6787] eta 0:15:50 lr 0.000200 time 0.2536 (0.2509) loss 0.3554 (0.3604) grad_norm 113663.1094 (126584.4766) mem 14543MB +[2023-10-12 05:36:06 simmim_pretrain](main_simmim.py 218): INFO Train: [106/200][3500/6787] eta 0:13:44 lr 0.000200 time 0.2474 (0.2509) loss 0.3676 (0.3603) grad_norm 152326.6094 (139064.1719) mem 14543MB +[2023-10-12 05:38:12 simmim_pretrain](main_simmim.py 218): INFO Train: [106/200][4000/6787] eta 0:11:39 lr 0.000200 time 0.2495 (0.2508) loss 0.3594 (0.3600) grad_norm 213698.9375 (152384.1719) mem 14543MB +[2023-10-12 05:40:17 simmim_pretrain](main_simmim.py 218): INFO Train: [106/200][4500/6787] eta 0:09:33 lr 0.000200 time 0.2524 (0.2508) loss 0.3544 (0.3599) grad_norm 368539.8125 (166747.6250) mem 14543MB +[2023-10-12 05:42:22 simmim_pretrain](main_simmim.py 218): INFO Train: [106/200][5000/6787] eta 0:07:28 lr 0.000200 time 0.2535 (0.2508) loss 0.3537 (0.3597) grad_norm 456558.5938 (182570.6094) mem 14543MB +[2023-10-12 05:44:28 simmim_pretrain](main_simmim.py 218): INFO Train: 
[106/200][5500/6787] eta 0:05:22 lr 0.000200 time 0.2543 (0.2508) loss 0.3507 (0.3596) grad_norm 244715.2031 (inf) mem 14543MB +[2023-10-12 05:46:33 simmim_pretrain](main_simmim.py 218): INFO Train: [106/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2484 (0.2508) loss 0.3396 (0.3594) grad_norm 326379.0938 (inf) mem 14543MB +[2023-10-12 05:48:38 simmim_pretrain](main_simmim.py 218): INFO Train: [106/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2486 (0.2507) loss 0.3535 (0.3593) grad_norm 285244.6250 (inf) mem 14543MB +[2023-10-12 05:49:51 simmim_pretrain](main_simmim.py 228): INFO EPOCH 106 training takes 0:28:22 +[2023-10-12 05:49:52 simmim_pretrain](main_simmim.py 218): INFO Train: [107/200][0/6787] eta 2:49:51 lr 0.000200 time 1.5015 (1.5015) loss 0.3506 (0.3506) grad_norm 316355.0625 (316355.0625) mem 14543MB +[2023-10-12 05:51:57 simmim_pretrain](main_simmim.py 218): INFO Train: [107/200][500/6787] eta 0:26:28 lr 0.000200 time 0.2463 (0.2526) loss 0.3558 (0.3577) grad_norm 578327.2500 (inf) mem 14543MB +[2023-10-12 05:54:02 simmim_pretrain](main_simmim.py 218): INFO Train: [107/200][1000/6787] eta 0:24:16 lr 0.000200 time 0.2483 (0.2517) loss 0.3654 (0.3587) grad_norm 231278.6250 (inf) mem 14543MB +[2023-10-12 05:56:08 simmim_pretrain](main_simmim.py 218): INFO Train: [107/200][1500/6787] eta 0:22:07 lr 0.000200 time 0.2510 (0.2512) loss 0.3700 (0.3591) grad_norm 187624.9844 (inf) mem 14543MB +[2023-10-12 05:58:13 simmim_pretrain](main_simmim.py 218): INFO Train: [107/200][2000/6787] eta 0:20:01 lr 0.000200 time 0.2550 (0.2510) loss 0.3776 (0.3596) grad_norm 314052.6562 (inf) mem 14543MB +[2023-10-12 06:00:18 simmim_pretrain](main_simmim.py 218): INFO Train: [107/200][2500/6787] eta 0:17:55 lr 0.000200 time 0.2494 (0.2509) loss 0.3720 (0.3596) grad_norm 129498.7891 (inf) mem 14543MB +[2023-10-12 06:02:23 simmim_pretrain](main_simmim.py 218): INFO Train: [107/200][3000/6787] eta 0:15:49 lr 0.000200 time 0.2510 (0.2508) loss 0.3557 (0.3596) grad_norm 
204868.2344 (inf) mem 14543MB +[2023-10-12 06:04:28 simmim_pretrain](main_simmim.py 218): INFO Train: [107/200][3500/6787] eta 0:13:43 lr 0.000200 time 0.2475 (0.2507) loss 0.3616 (0.3596) grad_norm 200137.6562 (inf) mem 14543MB +[2023-10-12 06:06:33 simmim_pretrain](main_simmim.py 218): INFO Train: [107/200][4000/6787] eta 0:11:38 lr 0.000200 time 0.2499 (0.2507) loss 0.3693 (0.3596) grad_norm 111547.7109 (inf) mem 14543MB +[2023-10-12 06:08:39 simmim_pretrain](main_simmim.py 218): INFO Train: [107/200][4500/6787] eta 0:09:33 lr 0.000200 time 0.2448 (0.2507) loss 0.3844 (0.3597) grad_norm 409323.9375 (inf) mem 14543MB +[2023-10-12 06:10:44 simmim_pretrain](main_simmim.py 218): INFO Train: [107/200][5000/6787] eta 0:07:27 lr 0.000200 time 0.2502 (0.2506) loss 0.3459 (0.3598) grad_norm 171679.5156 (inf) mem 14543MB +[2023-10-12 06:12:49 simmim_pretrain](main_simmim.py 218): INFO Train: [107/200][5500/6787] eta 0:05:22 lr 0.000200 time 0.2484 (0.2507) loss 0.3553 (0.3597) grad_norm 234578.7031 (inf) mem 14543MB +[2023-10-12 06:14:55 simmim_pretrain](main_simmim.py 218): INFO Train: [107/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2516 (0.2507) loss 0.3621 (0.3596) grad_norm 414758.6250 (inf) mem 14543MB +[2023-10-12 06:17:00 simmim_pretrain](main_simmim.py 218): INFO Train: [107/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2508 (0.2507) loss 0.3619 (0.3594) grad_norm 418682.9375 (inf) mem 14543MB +[2023-10-12 06:18:13 simmim_pretrain](main_simmim.py 228): INFO EPOCH 107 training takes 0:28:22 +[2023-10-12 06:18:14 simmim_pretrain](main_simmim.py 218): INFO Train: [108/200][0/6787] eta 2:47:03 lr 0.000200 time 1.4769 (1.4769) loss 0.3466 (0.3466) grad_norm 216161.0469 (216161.0469) mem 14543MB +[2023-10-12 06:20:20 simmim_pretrain](main_simmim.py 218): INFO Train: [108/200][500/6787] eta 0:26:31 lr 0.000200 time 0.2471 (0.2531) loss 0.3505 (0.3572) grad_norm 317402.1250 (435890.6250) mem 14543MB +[2023-10-12 06:22:25 simmim_pretrain](main_simmim.py 218): INFO 
Train: [108/200][1000/6787] eta 0:24:17 lr 0.000200 time 0.2541 (0.2519) loss 0.3769 (0.3583) grad_norm 199202.3281 (inf) mem 14543MB +[2023-10-12 06:24:30 simmim_pretrain](main_simmim.py 218): INFO Train: [108/200][1500/6787] eta 0:22:09 lr 0.000200 time 0.2504 (0.2515) loss 0.3549 (0.3586) grad_norm 323367.7188 (inf) mem 14543MB +[2023-10-12 06:26:36 simmim_pretrain](main_simmim.py 218): INFO Train: [108/200][2000/6787] eta 0:20:02 lr 0.000200 time 0.2494 (0.2513) loss 0.3648 (0.3591) grad_norm 212898.8438 (inf) mem 14543MB +[2023-10-12 06:28:41 simmim_pretrain](main_simmim.py 218): INFO Train: [108/200][2500/6787] eta 0:17:56 lr 0.000200 time 0.2504 (0.2511) loss 0.3701 (0.3591) grad_norm 251535.2812 (inf) mem 14543MB +[2023-10-12 06:30:46 simmim_pretrain](main_simmim.py 218): INFO Train: [108/200][3000/6787] eta 0:15:50 lr 0.000200 time 0.2509 (0.2511) loss 0.3601 (0.3589) grad_norm 176810.5938 (inf) mem 14543MB +[2023-10-12 06:32:51 simmim_pretrain](main_simmim.py 218): INFO Train: [108/200][3500/6787] eta 0:13:45 lr 0.000200 time 0.2554 (0.2510) loss 0.3537 (0.3589) grad_norm 206705.0938 (inf) mem 14543MB +[2023-10-12 06:34:57 simmim_pretrain](main_simmim.py 218): INFO Train: [108/200][4000/6787] eta 0:11:39 lr 0.000200 time 0.2590 (0.2509) loss 0.3576 (0.3587) grad_norm 494662.0938 (inf) mem 14543MB +[2023-10-12 06:37:02 simmim_pretrain](main_simmim.py 218): INFO Train: [108/200][4500/6787] eta 0:09:33 lr 0.000200 time 0.2505 (0.2509) loss 0.3723 (0.3587) grad_norm 213262.2031 (inf) mem 14543MB +[2023-10-12 06:39:07 simmim_pretrain](main_simmim.py 218): INFO Train: [108/200][5000/6787] eta 0:07:28 lr 0.000200 time 0.2511 (0.2508) loss 0.3730 (0.3586) grad_norm 352535.3750 (inf) mem 14543MB +[2023-10-12 06:41:12 simmim_pretrain](main_simmim.py 218): INFO Train: [108/200][5500/6787] eta 0:05:22 lr 0.000200 time 0.2485 (0.2508) loss 0.3594 (0.3585) grad_norm 414309.1875 (inf) mem 14543MB +[2023-10-12 06:43:17 simmim_pretrain](main_simmim.py 218): INFO Train: 
[108/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2512 (0.2507) loss 0.3724 (0.3585) grad_norm 244333.7656 (inf) mem 14543MB +[2023-10-12 06:45:23 simmim_pretrain](main_simmim.py 218): INFO Train: [108/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2468 (0.2507) loss 0.3579 (0.3585) grad_norm 219809.8594 (inf) mem 14543MB +[2023-10-12 06:46:35 simmim_pretrain](main_simmim.py 228): INFO EPOCH 108 training takes 0:28:21 +[2023-10-12 06:46:36 simmim_pretrain](main_simmim.py 218): INFO Train: [109/200][0/6787] eta 2:41:43 lr 0.000200 time 1.4297 (1.4297) loss 0.3473 (0.3473) grad_norm 258234.1094 (258234.1094) mem 14543MB +[2023-10-12 06:48:41 simmim_pretrain](main_simmim.py 218): INFO Train: [109/200][500/6787] eta 0:26:26 lr 0.000200 time 0.2482 (0.2523) loss 0.3687 (0.3605) grad_norm 356237.1875 (215263.0781) mem 14543MB +[2023-10-12 06:50:46 simmim_pretrain](main_simmim.py 218): INFO Train: [109/200][1000/6787] eta 0:24:13 lr 0.000200 time 0.2487 (0.2512) loss 0.3492 (0.3602) grad_norm 235034.3906 (225949.5469) mem 14543MB +[2023-10-12 06:52:51 simmim_pretrain](main_simmim.py 218): INFO Train: [109/200][1500/6787] eta 0:22:06 lr 0.000200 time 0.2459 (0.2509) loss 0.3704 (0.3602) grad_norm 202601.5469 (inf) mem 14543MB +[2023-10-12 06:54:56 simmim_pretrain](main_simmim.py 218): INFO Train: [109/200][2000/6787] eta 0:20:00 lr 0.000200 time 0.2496 (0.2507) loss 0.3417 (0.3603) grad_norm 274599.2188 (inf) mem 14543MB +[2023-10-12 06:57:02 simmim_pretrain](main_simmim.py 218): INFO Train: [109/200][2500/6787] eta 0:17:54 lr 0.000200 time 0.2581 (0.2507) loss 0.3493 (0.3604) grad_norm 163099.1562 (inf) mem 14543MB +[2023-10-12 06:59:07 simmim_pretrain](main_simmim.py 218): INFO Train: [109/200][3000/6787] eta 0:15:48 lr 0.000200 time 0.2528 (0.2506) loss 0.3497 (0.3603) grad_norm 240821.7969 (inf) mem 14543MB +[2023-10-12 07:01:12 simmim_pretrain](main_simmim.py 218): INFO Train: [109/200][3500/6787] eta 0:13:43 lr 0.000200 time 0.2484 (0.2505) loss 0.3735 (0.3602) 
grad_norm 322012.6250 (inf) mem 14543MB +[2023-10-12 07:03:17 simmim_pretrain](main_simmim.py 218): INFO Train: [109/200][4000/6787] eta 0:11:38 lr 0.000200 time 0.2524 (0.2506) loss 0.3626 (0.3600) grad_norm 196154.6875 (inf) mem 14543MB +[2023-10-12 07:05:23 simmim_pretrain](main_simmim.py 218): INFO Train: [109/200][4500/6787] eta 0:09:33 lr 0.000200 time 0.2559 (0.2506) loss 0.3628 (0.3597) grad_norm 362395.7188 (inf) mem 14543MB +[2023-10-12 07:07:28 simmim_pretrain](main_simmim.py 218): INFO Train: [109/200][5000/6787] eta 0:07:27 lr 0.000200 time 0.2537 (0.2506) loss 0.3495 (0.3595) grad_norm 383274.4062 (inf) mem 14543MB +[2023-10-12 07:09:33 simmim_pretrain](main_simmim.py 218): INFO Train: [109/200][5500/6787] eta 0:05:22 lr 0.000200 time 0.2546 (0.2506) loss 0.3635 (0.3594) grad_norm 354818.5938 (inf) mem 14543MB +[2023-10-12 07:11:39 simmim_pretrain](main_simmim.py 218): INFO Train: [109/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2589 (0.2506) loss 0.3711 (0.3593) grad_norm 598589.8750 (inf) mem 14543MB +[2023-10-12 07:13:44 simmim_pretrain](main_simmim.py 218): INFO Train: [109/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2480 (0.2506) loss 0.3491 (0.3591) grad_norm 368270.1875 (inf) mem 14543MB +[2023-10-12 07:14:56 simmim_pretrain](main_simmim.py 228): INFO EPOCH 109 training takes 0:28:21 +[2023-10-12 07:14:58 simmim_pretrain](main_simmim.py 218): INFO Train: [110/200][0/6787] eta 2:40:02 lr 0.000200 time 1.4149 (1.4149) loss 0.3750 (0.3750) grad_norm 858921.2500 (858921.2500) mem 14543MB +[2023-10-12 07:17:03 simmim_pretrain](main_simmim.py 218): INFO Train: [110/200][500/6787] eta 0:26:26 lr 0.000200 time 0.2588 (0.2524) loss 0.3347 (0.3577) grad_norm 551103.6875 (433006.8438) mem 14543MB +[2023-10-12 07:19:08 simmim_pretrain](main_simmim.py 218): INFO Train: [110/200][1000/6787] eta 0:24:14 lr 0.000200 time 0.2511 (0.2513) loss 0.3618 (0.3576) grad_norm 418321.0938 (421670.2500) mem 14543MB +[2023-10-12 07:21:13 
simmim_pretrain](main_simmim.py 218): INFO Train: [110/200][1500/6787] eta 0:22:06 lr 0.000200 time 0.2461 (0.2509) loss 0.3757 (0.3577) grad_norm 719537.0625 (inf) mem 14543MB +[2023-10-12 07:23:18 simmim_pretrain](main_simmim.py 218): INFO Train: [110/200][2000/6787] eta 0:20:00 lr 0.000200 time 0.2482 (0.2508) loss 0.3536 (0.3580) grad_norm 291218.4688 (inf) mem 14543MB +[2023-10-12 07:25:23 simmim_pretrain](main_simmim.py 218): INFO Train: [110/200][2500/6787] eta 0:17:54 lr 0.000200 time 0.2469 (0.2507) loss 0.3558 (0.3586) grad_norm 141681.6250 (inf) mem 14543MB +[2023-10-12 07:27:28 simmim_pretrain](main_simmim.py 218): INFO Train: [110/200][3000/6787] eta 0:15:49 lr 0.000200 time 0.2459 (0.2506) loss 0.3619 (0.3588) grad_norm 186532.3594 (inf) mem 14543MB +[2023-10-12 07:29:34 simmim_pretrain](main_simmim.py 218): INFO Train: [110/200][3500/6787] eta 0:13:43 lr 0.000200 time 0.2591 (0.2506) loss 0.3370 (0.3589) grad_norm 225808.6250 (inf) mem 14543MB +[2023-10-12 07:31:39 simmim_pretrain](main_simmim.py 218): INFO Train: [110/200][4000/6787] eta 0:11:38 lr 0.000200 time 0.2473 (0.2506) loss 0.3567 (0.3590) grad_norm 679506.2500 (inf) mem 14543MB +[2023-10-12 07:33:44 simmim_pretrain](main_simmim.py 218): INFO Train: [110/200][4500/6787] eta 0:09:32 lr 0.000200 time 0.2451 (0.2505) loss 0.3643 (0.3591) grad_norm 210184.6406 (inf) mem 14543MB +[2023-10-12 07:35:49 simmim_pretrain](main_simmim.py 218): INFO Train: [110/200][5000/6787] eta 0:07:27 lr 0.000200 time 0.2460 (0.2505) loss 0.3540 (0.3590) grad_norm 498058.8750 (inf) mem 14543MB +[2023-10-12 07:37:54 simmim_pretrain](main_simmim.py 218): INFO Train: [110/200][5500/6787] eta 0:05:22 lr 0.000200 time 0.2467 (0.2505) loss 0.3488 (0.3590) grad_norm 514772.0938 (inf) mem 14543MB +[2023-10-12 07:39:59 simmim_pretrain](main_simmim.py 218): INFO Train: [110/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2523 (0.2505) loss 0.3823 (0.3590) grad_norm 313216.3438 (inf) mem 14543MB +[2023-10-12 07:42:05 
simmim_pretrain](main_simmim.py 218): INFO Train: [110/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2509 (0.2505) loss 0.3571 (0.3590) grad_norm 228159.7500 (inf) mem 14543MB +[2023-10-12 07:43:17 simmim_pretrain](main_simmim.py 228): INFO EPOCH 110 training takes 0:28:20 +[2023-10-12 07:43:18 simmim_pretrain](main_simmim.py 218): INFO Train: [111/200][0/6787] eta 2:44:51 lr 0.000200 time 1.4574 (1.4574) loss 0.3666 (0.3666) grad_norm 255650.8281 (255650.8281) mem 14543MB +[2023-10-12 07:45:23 simmim_pretrain](main_simmim.py 218): INFO Train: [111/200][500/6787] eta 0:26:25 lr 0.000200 time 0.2515 (0.2523) loss 0.3599 (0.3585) grad_norm 255126.9375 (251659.5156) mem 14543MB +[2023-10-12 07:47:28 simmim_pretrain](main_simmim.py 218): INFO Train: [111/200][1000/6787] eta 0:24:12 lr 0.000200 time 0.2556 (0.2511) loss 0.3324 (0.3592) grad_norm 223996.7031 (244435.8281) mem 14543MB +[2023-10-12 07:49:33 simmim_pretrain](main_simmim.py 218): INFO Train: [111/200][1500/6787] eta 0:22:05 lr 0.000200 time 0.2468 (0.2508) loss 0.3531 (0.3593) grad_norm 177591.2188 (254222.7969) mem 14543MB +[2023-10-12 07:51:38 simmim_pretrain](main_simmim.py 218): INFO Train: [111/200][2000/6787] eta 0:19:59 lr 0.000200 time 0.2447 (0.2505) loss 0.3572 (0.3590) grad_norm 293143.2500 (279598.5312) mem 14543MB +[2023-10-12 07:53:43 simmim_pretrain](main_simmim.py 218): INFO Train: [111/200][2500/6787] eta 0:17:53 lr 0.000200 time 0.2485 (0.2504) loss 0.3643 (0.3590) grad_norm 393097.0625 (293700.0938) mem 14543MB +[2023-10-12 07:55:48 simmim_pretrain](main_simmim.py 218): INFO Train: [111/200][3000/6787] eta 0:15:48 lr 0.000200 time 0.2476 (0.2504) loss 0.3497 (0.3586) grad_norm 633481.9375 (315096.1250) mem 14543MB +[2023-10-12 07:57:54 simmim_pretrain](main_simmim.py 218): INFO Train: [111/200][3500/6787] eta 0:13:43 lr 0.000200 time 0.2511 (0.2504) loss 0.3391 (0.3584) grad_norm 891781.8750 (inf) mem 14543MB +[2023-10-12 07:59:59 simmim_pretrain](main_simmim.py 218): INFO Train: 
[111/200][4000/6787] eta 0:11:37 lr 0.000200 time 0.2486 (0.2504) loss 0.3593 (0.3584) grad_norm 750587.6875 (inf) mem 14543MB +[2023-10-12 08:02:04 simmim_pretrain](main_simmim.py 218): INFO Train: [111/200][4500/6787] eta 0:09:32 lr 0.000200 time 0.2475 (0.2504) loss 0.3471 (0.3586) grad_norm 156999.4844 (inf) mem 14543MB +[2023-10-12 08:04:09 simmim_pretrain](main_simmim.py 218): INFO Train: [111/200][5000/6787] eta 0:07:27 lr 0.000200 time 0.2505 (0.2504) loss 0.3499 (0.3587) grad_norm 231542.8906 (inf) mem 14543MB +[2023-10-12 08:06:14 simmim_pretrain](main_simmim.py 218): INFO Train: [111/200][5500/6787] eta 0:05:22 lr 0.000200 time 0.2502 (0.2504) loss 0.3472 (0.3589) grad_norm 91620.5469 (inf) mem 14543MB +[2023-10-12 08:08:20 simmim_pretrain](main_simmim.py 218): INFO Train: [111/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2483 (0.2504) loss 0.3683 (0.3593) grad_norm 152272.5156 (inf) mem 14543MB +[2023-10-12 08:10:27 simmim_pretrain](main_simmim.py 218): INFO Train: [111/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2547 (0.2508) loss 0.3408 (0.3595) grad_norm 132867.4844 (inf) mem 14543MB +[2023-10-12 08:11:41 simmim_pretrain](main_simmim.py 228): INFO EPOCH 111 training takes 0:28:24 +[2023-10-12 08:11:42 simmim_pretrain](main_simmim.py 218): INFO Train: [112/200][0/6787] eta 2:56:52 lr 0.000200 time 1.5637 (1.5637) loss 0.3633 (0.3633) grad_norm 182611.2969 (182611.2969) mem 14543MB +[2023-10-12 08:13:48 simmim_pretrain](main_simmim.py 218): INFO Train: [112/200][500/6787] eta 0:26:30 lr 0.000200 time 0.2501 (0.2530) loss 0.3850 (0.3622) grad_norm 93117.5859 (130352.5156) mem 14543MB +[2023-10-12 08:15:53 simmim_pretrain](main_simmim.py 218): INFO Train: [112/200][1000/6787] eta 0:24:16 lr 0.000200 time 0.2466 (0.2517) loss 0.3696 (0.3615) grad_norm 152613.9219 (145454.0000) mem 14543MB +[2023-10-12 08:17:58 simmim_pretrain](main_simmim.py 218): INFO Train: [112/200][1500/6787] eta 0:22:08 lr 0.000200 time 0.2592 (0.2512) loss 0.3352 (0.3608) 
grad_norm 114691.8047 (161896.9688) mem 14543MB +[2023-10-12 08:20:03 simmim_pretrain](main_simmim.py 218): INFO Train: [112/200][2000/6787] eta 0:20:01 lr 0.000200 time 0.2521 (0.2510) loss 0.3617 (0.3604) grad_norm 90712.2188 (173063.3438) mem 14543MB +[2023-10-12 08:22:08 simmim_pretrain](main_simmim.py 218): INFO Train: [112/200][2500/6787] eta 0:17:55 lr 0.000200 time 0.2506 (0.2509) loss 0.3653 (0.3605) grad_norm 216577.2812 (183540.7500) mem 14543MB +[2023-10-12 08:24:13 simmim_pretrain](main_simmim.py 218): INFO Train: [112/200][3000/6787] eta 0:15:49 lr 0.000200 time 0.2542 (0.2508) loss 0.3580 (0.3600) grad_norm 311077.3438 (207840.5312) mem 14543MB +[2023-10-12 08:26:19 simmim_pretrain](main_simmim.py 218): INFO Train: [112/200][3500/6787] eta 0:13:44 lr 0.000200 time 0.2507 (0.2507) loss 0.3530 (0.3599) grad_norm 330789.3125 (inf) mem 14543MB +[2023-10-12 08:28:24 simmim_pretrain](main_simmim.py 218): INFO Train: [112/200][4000/6787] eta 0:11:38 lr 0.000200 time 0.2506 (0.2506) loss 0.3390 (0.3600) grad_norm 336085.7500 (inf) mem 14543MB +[2023-10-12 08:30:29 simmim_pretrain](main_simmim.py 218): INFO Train: [112/200][4500/6787] eta 0:09:33 lr 0.000200 time 0.2491 (0.2506) loss 0.3313 (0.3600) grad_norm 129028.0703 (inf) mem 14543MB +[2023-10-12 08:32:34 simmim_pretrain](main_simmim.py 218): INFO Train: [112/200][5000/6787] eta 0:07:27 lr 0.000200 time 0.2459 (0.2505) loss 0.3515 (0.3600) grad_norm 310883.1250 (inf) mem 14543MB +[2023-10-12 08:34:39 simmim_pretrain](main_simmim.py 218): INFO Train: [112/200][5500/6787] eta 0:05:22 lr 0.000200 time 0.2533 (0.2505) loss 0.3432 (0.3598) grad_norm 216105.2969 (inf) mem 14543MB +[2023-10-12 08:36:44 simmim_pretrain](main_simmim.py 218): INFO Train: [112/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2479 (0.2504) loss 0.3561 (0.3597) grad_norm 839788.4375 (inf) mem 14543MB +[2023-10-12 08:38:49 simmim_pretrain](main_simmim.py 218): INFO Train: [112/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2493 (0.2504) 
loss 0.3780 (0.3595) grad_norm 408737.6875 (inf) mem 14543MB +[2023-10-12 08:40:01 simmim_pretrain](main_simmim.py 228): INFO EPOCH 112 training takes 0:28:20 +[2023-10-12 08:40:03 simmim_pretrain](main_simmim.py 218): INFO Train: [113/200][0/6787] eta 2:46:54 lr 0.000200 time 1.4755 (1.4755) loss 0.3513 (0.3513) grad_norm 329901.4375 (329901.4375) mem 14543MB +[2023-10-12 08:42:07 simmim_pretrain](main_simmim.py 218): INFO Train: [113/200][500/6787] eta 0:26:24 lr 0.000200 time 0.2505 (0.2521) loss 0.3721 (0.3577) grad_norm 577443.4375 (inf) mem 14543MB +[2023-10-12 08:44:13 simmim_pretrain](main_simmim.py 218): INFO Train: [113/200][1000/6787] eta 0:24:13 lr 0.000200 time 0.2463 (0.2511) loss 0.3770 (0.3599) grad_norm 129677.6641 (inf) mem 14543MB +[2023-10-12 08:46:18 simmim_pretrain](main_simmim.py 218): INFO Train: [113/200][1500/6787] eta 0:22:05 lr 0.000200 time 0.2517 (0.2508) loss 0.3528 (0.3611) grad_norm 61757.3047 (inf) mem 14543MB +[2023-10-12 08:48:23 simmim_pretrain](main_simmim.py 218): INFO Train: [113/200][2000/6787] eta 0:19:59 lr 0.000200 time 0.2489 (0.2507) loss 0.3716 (0.3611) grad_norm 154113.6406 (inf) mem 14543MB +[2023-10-12 08:50:28 simmim_pretrain](main_simmim.py 218): INFO Train: [113/200][2500/6787] eta 0:17:54 lr 0.000200 time 0.2496 (0.2506) loss 0.3638 (0.3614) grad_norm 170958.3594 (inf) mem 14543MB +[2023-10-12 08:52:33 simmim_pretrain](main_simmim.py 218): INFO Train: [113/200][3000/6787] eta 0:15:48 lr 0.000200 time 0.2486 (0.2506) loss 0.3563 (0.3614) grad_norm 117938.3750 (inf) mem 14543MB +[2023-10-12 08:54:38 simmim_pretrain](main_simmim.py 218): INFO Train: [113/200][3500/6787] eta 0:13:43 lr 0.000200 time 0.2487 (0.2506) loss 0.3387 (0.3612) grad_norm 274557.0312 (inf) mem 14543MB +[2023-10-12 08:56:44 simmim_pretrain](main_simmim.py 218): INFO Train: [113/200][4000/6787] eta 0:11:38 lr 0.000200 time 0.2486 (0.2506) loss 0.3815 (0.3611) grad_norm 223480.6719 (inf) mem 14543MB +[2023-10-12 08:58:49 
simmim_pretrain](main_simmim.py 218): INFO Train: [113/200][4500/6787] eta 0:09:33 lr 0.000200 time 0.2507 (0.2506) loss 0.3564 (0.3609) grad_norm 241623.5938 (inf) mem 14543MB +[2023-10-12 09:00:55 simmim_pretrain](main_simmim.py 218): INFO Train: [113/200][5000/6787] eta 0:07:27 lr 0.000200 time 0.2496 (0.2507) loss 0.3597 (0.3607) grad_norm 591766.6250 (inf) mem 14543MB +[2023-10-12 09:03:00 simmim_pretrain](main_simmim.py 218): INFO Train: [113/200][5500/6787] eta 0:05:22 lr 0.000200 time 0.2506 (0.2507) loss 0.3603 (0.3604) grad_norm 419936.9688 (inf) mem 14543MB +[2023-10-12 09:05:05 simmim_pretrain](main_simmim.py 218): INFO Train: [113/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2463 (0.2506) loss 0.3568 (0.3601) grad_norm 455009.7812 (inf) mem 14543MB +[2023-10-12 09:07:11 simmim_pretrain](main_simmim.py 218): INFO Train: [113/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2506 (0.2507) loss 0.3535 (0.3600) grad_norm 244303.5000 (inf) mem 14543MB +[2023-10-12 09:08:23 simmim_pretrain](main_simmim.py 228): INFO EPOCH 113 training takes 0:28:21 +[2023-10-12 09:08:25 simmim_pretrain](main_simmim.py 218): INFO Train: [114/200][0/6787] eta 2:50:50 lr 0.000200 time 1.5103 (1.5103) loss 0.3538 (0.3538) grad_norm 175280.5000 (175280.5000) mem 14543MB +[2023-10-12 09:10:30 simmim_pretrain](main_simmim.py 218): INFO Train: [114/200][500/6787] eta 0:26:27 lr 0.000200 time 0.2485 (0.2526) loss 0.3495 (0.3581) grad_norm 204833.8125 (266115.7500) mem 14543MB +[2023-10-12 09:12:35 simmim_pretrain](main_simmim.py 218): INFO Train: [114/200][1000/6787] eta 0:24:14 lr 0.000200 time 0.2517 (0.2514) loss 0.3674 (0.3593) grad_norm 240993.4375 (254562.2188) mem 14543MB +[2023-10-12 09:14:40 simmim_pretrain](main_simmim.py 218): INFO Train: [114/200][1500/6787] eta 0:22:07 lr 0.000200 time 0.2472 (0.2511) loss 0.3519 (0.3593) grad_norm 103221.5703 (249725.1094) mem 14543MB +[2023-10-12 09:16:45 simmim_pretrain](main_simmim.py 218): INFO Train: [114/200][2000/6787] eta 0:20:01 
lr 0.000200 time 0.2594 (0.2510) loss 0.3436 (0.3593) grad_norm 248129.5156 (261657.9531) mem 14543MB +[2023-10-12 09:18:50 simmim_pretrain](main_simmim.py 218): INFO Train: [114/200][2500/6787] eta 0:17:55 lr 0.000200 time 0.2465 (0.2508) loss 0.3801 (0.3589) grad_norm 820611.7500 (278469.7812) mem 14543MB +[2023-10-12 09:20:55 simmim_pretrain](main_simmim.py 218): INFO Train: [114/200][3000/6787] eta 0:15:49 lr 0.000200 time 0.2513 (0.2506) loss 0.3701 (0.3589) grad_norm 341962.5625 (296981.6250) mem 14543MB +[2023-10-12 09:23:00 simmim_pretrain](main_simmim.py 218): INFO Train: [114/200][3500/6787] eta 0:13:43 lr 0.000200 time 0.2463 (0.2506) loss 0.3522 (0.3588) grad_norm 250905.0156 (312322.2812) mem 14543MB +[2023-10-12 09:25:05 simmim_pretrain](main_simmim.py 218): INFO Train: [114/200][4000/6787] eta 0:11:38 lr 0.000200 time 0.2475 (0.2505) loss 0.3636 (0.3587) grad_norm 535936.0625 (inf) mem 14543MB +[2023-10-12 09:27:11 simmim_pretrain](main_simmim.py 218): INFO Train: [114/200][4500/6787] eta 0:09:32 lr 0.000200 time 0.2470 (0.2505) loss 0.3678 (0.3587) grad_norm 444966.5625 (inf) mem 14543MB +[2023-10-12 09:29:16 simmim_pretrain](main_simmim.py 218): INFO Train: [114/200][5000/6787] eta 0:07:27 lr 0.000200 time 0.2496 (0.2505) loss 0.3703 (0.3588) grad_norm 322821.4062 (inf) mem 14543MB +[2023-10-12 09:31:21 simmim_pretrain](main_simmim.py 218): INFO Train: [114/200][5500/6787] eta 0:05:22 lr 0.000200 time 0.2487 (0.2504) loss 0.3483 (0.3589) grad_norm 278766.9375 (inf) mem 14543MB +[2023-10-12 09:33:26 simmim_pretrain](main_simmim.py 218): INFO Train: [114/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2548 (0.2504) loss 0.3624 (0.3589) grad_norm 239693.4688 (inf) mem 14543MB +[2023-10-12 09:35:31 simmim_pretrain](main_simmim.py 218): INFO Train: [114/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2540 (0.2504) loss 0.3706 (0.3590) grad_norm 258044.2656 (inf) mem 14543MB +[2023-10-12 09:36:43 simmim_pretrain](main_simmim.py 228): INFO EPOCH 114 
training takes 0:28:19 +[2023-10-12 09:36:44 simmim_pretrain](main_simmim.py 218): INFO Train: [115/200][0/6787] eta 2:36:17 lr 0.000200 time 1.3818 (1.3818) loss 0.3531 (0.3531) grad_norm 210865.9688 (210865.9688) mem 14543MB +[2023-10-12 09:38:49 simmim_pretrain](main_simmim.py 218): INFO Train: [115/200][500/6787] eta 0:26:27 lr 0.000200 time 0.2487 (0.2525) loss 0.3548 (0.3584) grad_norm 339419.5625 (376496.5938) mem 14543MB +[2023-10-12 09:40:55 simmim_pretrain](main_simmim.py 218): INFO Train: [115/200][1000/6787] eta 0:24:15 lr 0.000200 time 0.2588 (0.2515) loss 0.3604 (0.3590) grad_norm 173606.2500 (inf) mem 14543MB +[2023-10-12 09:43:00 simmim_pretrain](main_simmim.py 218): INFO Train: [115/200][1500/6787] eta 0:22:08 lr 0.000200 time 0.2502 (0.2512) loss 0.3744 (0.3599) grad_norm 266474.3125 (inf) mem 14543MB +[2023-10-12 09:45:06 simmim_pretrain](main_simmim.py 218): INFO Train: [115/200][2000/6787] eta 0:20:02 lr 0.000200 time 0.2493 (0.2513) loss 0.3431 (0.3601) grad_norm 127611.3594 (inf) mem 14543MB +[2023-10-12 09:47:11 simmim_pretrain](main_simmim.py 218): INFO Train: [115/200][2500/6787] eta 0:17:56 lr 0.000200 time 0.2473 (0.2512) loss 0.3712 (0.3600) grad_norm 246727.9531 (inf) mem 14543MB +[2023-10-12 09:49:17 simmim_pretrain](main_simmim.py 218): INFO Train: [115/200][3000/6787] eta 0:15:51 lr 0.000200 time 0.2464 (0.2512) loss 0.3631 (0.3599) grad_norm 286571.0938 (inf) mem 14543MB +[2023-10-12 09:51:23 simmim_pretrain](main_simmim.py 218): INFO Train: [115/200][3500/6787] eta 0:13:46 lr 0.000200 time 0.2490 (0.2513) loss 0.3531 (0.3596) grad_norm 224341.6406 (inf) mem 14543MB +[2023-10-12 09:53:29 simmim_pretrain](main_simmim.py 218): INFO Train: [115/200][4000/6787] eta 0:11:40 lr 0.000200 time 0.2512 (0.2514) loss 0.3784 (0.3595) grad_norm 241142.0156 (inf) mem 14543MB +[2023-10-12 09:55:35 simmim_pretrain](main_simmim.py 218): INFO Train: [115/200][4500/6787] eta 0:09:35 lr 0.000200 time 0.2588 (0.2514) loss 0.3394 (0.3595) grad_norm 
310090.0625 (inf) mem 14543MB +[2023-10-12 09:57:42 simmim_pretrain](main_simmim.py 218): INFO Train: [115/200][5000/6787] eta 0:07:29 lr 0.000200 time 0.2558 (0.2518) loss 0.3614 (0.3595) grad_norm 313936.4375 (inf) mem 14543MB +[2023-10-12 09:59:50 simmim_pretrain](main_simmim.py 218): INFO Train: [115/200][5500/6787] eta 0:05:24 lr 0.000200 time 0.2555 (0.2522) loss 0.3758 (0.3594) grad_norm 171392.4375 (inf) mem 14543MB +[2023-10-12 10:01:58 simmim_pretrain](main_simmim.py 218): INFO Train: [115/200][6000/6787] eta 0:03:18 lr 0.000200 time 0.2563 (0.2525) loss 0.3683 (0.3595) grad_norm 220458.1562 (inf) mem 14543MB +[2023-10-12 10:04:06 simmim_pretrain](main_simmim.py 218): INFO Train: [115/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2566 (0.2528) loss 0.3747 (0.3595) grad_norm 247485.9375 (inf) mem 14543MB +[2023-10-12 10:05:20 simmim_pretrain](main_simmim.py 228): INFO EPOCH 115 training takes 0:28:37 +[2023-10-12 10:05:21 simmim_pretrain](main_simmim.py 218): INFO Train: [116/200][0/6787] eta 2:38:58 lr 0.000200 time 1.4054 (1.4054) loss 0.3591 (0.3591) grad_norm 340657.4062 (340657.4062) mem 14543MB +[2023-10-12 10:07:27 simmim_pretrain](main_simmim.py 218): INFO Train: [116/200][500/6787] eta 0:26:30 lr 0.000200 time 0.2597 (0.2530) loss 0.3768 (0.3615) grad_norm 166567.3438 (inf) mem 14543MB +[2023-10-12 10:09:33 simmim_pretrain](main_simmim.py 218): INFO Train: [116/200][1000/6787] eta 0:24:19 lr 0.000200 time 0.2516 (0.2523) loss 0.3670 (0.3624) grad_norm 170281.6406 (inf) mem 14543MB +[2023-10-12 10:11:38 simmim_pretrain](main_simmim.py 218): INFO Train: [116/200][1500/6787] eta 0:22:11 lr 0.000200 time 0.2476 (0.2518) loss 0.3635 (0.3622) grad_norm 142595.0156 (inf) mem 14543MB +[2023-10-12 10:13:43 simmim_pretrain](main_simmim.py 218): INFO Train: [116/200][2000/6787] eta 0:20:03 lr 0.000200 time 0.2530 (0.2515) loss 0.3620 (0.3622) grad_norm 124172.9609 (inf) mem 14543MB +[2023-10-12 10:15:49 simmim_pretrain](main_simmim.py 218): INFO Train: 
[116/200][2500/6787] eta 0:17:58 lr 0.000200 time 0.2495 (0.2515) loss 0.3739 (0.3623) grad_norm 115802.4453 (inf) mem 14543MB +[2023-10-12 10:17:55 simmim_pretrain](main_simmim.py 218): INFO Train: [116/200][3000/6787] eta 0:15:52 lr 0.000200 time 0.2479 (0.2514) loss 0.3479 (0.3618) grad_norm 215588.7344 (inf) mem 14543MB +[2023-10-12 10:20:00 simmim_pretrain](main_simmim.py 218): INFO Train: [116/200][3500/6787] eta 0:13:46 lr 0.000200 time 0.2540 (0.2514) loss 0.3762 (0.3617) grad_norm 253147.1562 (inf) mem 14543MB +[2023-10-12 10:22:06 simmim_pretrain](main_simmim.py 218): INFO Train: [116/200][4000/6787] eta 0:11:40 lr 0.000200 time 0.2518 (0.2513) loss 0.3870 (0.3618) grad_norm 122639.5234 (inf) mem 14543MB +[2023-10-12 10:24:11 simmim_pretrain](main_simmim.py 218): INFO Train: [116/200][4500/6787] eta 0:09:34 lr 0.000200 time 0.2492 (0.2513) loss 0.3557 (0.3617) grad_norm 148606.2188 (inf) mem 14543MB +[2023-10-12 10:26:17 simmim_pretrain](main_simmim.py 218): INFO Train: [116/200][5000/6787] eta 0:07:28 lr 0.000200 time 0.2515 (0.2512) loss 0.3549 (0.3618) grad_norm 155689.3438 (inf) mem 14543MB +[2023-10-12 10:28:22 simmim_pretrain](main_simmim.py 218): INFO Train: [116/200][5500/6787] eta 0:05:23 lr 0.000200 time 0.2540 (0.2512) loss 0.3799 (0.3617) grad_norm 104737.2344 (inf) mem 14543MB +[2023-10-12 10:30:28 simmim_pretrain](main_simmim.py 218): INFO Train: [116/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2472 (0.2512) loss 0.3333 (0.3615) grad_norm 235232.8906 (inf) mem 14543MB +[2023-10-12 10:32:33 simmim_pretrain](main_simmim.py 218): INFO Train: [116/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2526 (0.2512) loss 0.3733 (0.3613) grad_norm 281626.9062 (inf) mem 14543MB +[2023-10-12 10:33:46 simmim_pretrain](main_simmim.py 228): INFO EPOCH 116 training takes 0:28:25 +[2023-10-12 10:33:47 simmim_pretrain](main_simmim.py 218): INFO Train: [117/200][0/6787] eta 2:31:54 lr 0.000200 time 1.3429 (1.3429) loss 0.3385 (0.3385) grad_norm 253518.2500 
(253518.2500) mem 14543MB +[2023-10-12 10:35:52 simmim_pretrain](main_simmim.py 218): INFO Train: [117/200][500/6787] eta 0:26:26 lr 0.000200 time 0.2585 (0.2523) loss 0.3521 (0.3591) grad_norm 189912.4062 (250410.8281) mem 14543MB +[2023-10-12 10:37:57 simmim_pretrain](main_simmim.py 218): INFO Train: [117/200][1000/6787] eta 0:24:15 lr 0.000200 time 0.2514 (0.2515) loss 0.3599 (0.3590) grad_norm 323083.3438 (276954.4688) mem 14543MB +[2023-10-12 10:40:03 simmim_pretrain](main_simmim.py 218): INFO Train: [117/200][1500/6787] eta 0:22:08 lr 0.000200 time 0.2512 (0.2513) loss 0.3668 (0.3588) grad_norm 165663.0469 (311856.0000) mem 14543MB +[2023-10-12 10:42:08 simmim_pretrain](main_simmim.py 218): INFO Train: [117/200][2000/6787] eta 0:20:02 lr 0.000200 time 0.2520 (0.2512) loss 0.3472 (0.3586) grad_norm 852972.3750 (329970.4688) mem 14543MB +[2023-10-12 10:44:14 simmim_pretrain](main_simmim.py 218): INFO Train: [117/200][2500/6787] eta 0:17:56 lr 0.000200 time 0.2463 (0.2511) loss 0.3635 (0.3586) grad_norm 977080.9375 (368667.2812) mem 14543MB +[2023-10-12 10:46:19 simmim_pretrain](main_simmim.py 218): INFO Train: [117/200][3000/6787] eta 0:15:51 lr 0.000200 time 0.2490 (0.2512) loss 0.3573 (0.3586) grad_norm 283307.1875 (inf) mem 14543MB +[2023-10-12 10:48:25 simmim_pretrain](main_simmim.py 218): INFO Train: [117/200][3500/6787] eta 0:13:45 lr 0.000200 time 0.2469 (0.2511) loss 0.3647 (0.3586) grad_norm 178685.5312 (inf) mem 14543MB +[2023-10-12 10:50:30 simmim_pretrain](main_simmim.py 218): INFO Train: [117/200][4000/6787] eta 0:11:39 lr 0.000200 time 0.2589 (0.2511) loss 0.3789 (0.3587) grad_norm 321773.3438 (inf) mem 14543MB +[2023-10-12 10:52:36 simmim_pretrain](main_simmim.py 218): INFO Train: [117/200][4500/6787] eta 0:09:34 lr 0.000200 time 0.2501 (0.2511) loss 0.3397 (0.3587) grad_norm 231608.6875 (inf) mem 14543MB +[2023-10-12 10:54:41 simmim_pretrain](main_simmim.py 218): INFO Train: [117/200][5000/6787] eta 0:07:28 lr 0.000200 time 0.2486 (0.2511) loss 
0.3819 (0.3586) grad_norm 189594.5781 (inf) mem 14543MB +[2023-10-12 10:56:47 simmim_pretrain](main_simmim.py 218): INFO Train: [117/200][5500/6787] eta 0:05:23 lr 0.000200 time 0.2485 (0.2511) loss 0.3445 (0.3586) grad_norm 456327.1562 (inf) mem 14543MB +[2023-10-12 10:58:52 simmim_pretrain](main_simmim.py 218): INFO Train: [117/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2463 (0.2511) loss 0.3688 (0.3586) grad_norm 246278.5000 (inf) mem 14543MB +[2023-10-12 11:00:58 simmim_pretrain](main_simmim.py 218): INFO Train: [117/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2550 (0.2510) loss 0.3754 (0.3586) grad_norm 350821.0312 (inf) mem 14543MB +[2023-10-12 11:02:10 simmim_pretrain](main_simmim.py 228): INFO EPOCH 117 training takes 0:28:24 +[2023-10-12 11:02:12 simmim_pretrain](main_simmim.py 218): INFO Train: [118/200][0/6787] eta 2:52:47 lr 0.000200 time 1.5276 (1.5276) loss 0.3641 (0.3641) grad_norm 383179.1875 (383179.1875) mem 14543MB +[2023-10-12 11:04:17 simmim_pretrain](main_simmim.py 218): INFO Train: [118/200][500/6787] eta 0:26:29 lr 0.000200 time 0.2500 (0.2528) loss 0.3596 (0.3575) grad_norm 445160.5000 (452606.3125) mem 14543MB +[2023-10-12 11:06:22 simmim_pretrain](main_simmim.py 218): INFO Train: [118/200][1000/6787] eta 0:24:16 lr 0.000200 time 0.2502 (0.2517) loss 0.3585 (0.3570) grad_norm 316984.0625 (456598.5938) mem 14543MB +[2023-10-12 11:08:27 simmim_pretrain](main_simmim.py 218): INFO Train: [118/200][1500/6787] eta 0:22:08 lr 0.000200 time 0.2469 (0.2514) loss 0.3399 (0.3572) grad_norm 541144.8125 (450720.5312) mem 14543MB +[2023-10-12 11:10:33 simmim_pretrain](main_simmim.py 218): INFO Train: [118/200][2000/6787] eta 0:20:02 lr 0.000200 time 0.2526 (0.2512) loss 0.3322 (0.3575) grad_norm 436935.2812 (inf) mem 14543MB +[2023-10-12 11:12:38 simmim_pretrain](main_simmim.py 218): INFO Train: [118/200][2500/6787] eta 0:17:56 lr 0.000200 time 0.2457 (0.2511) loss 0.3800 (0.3581) grad_norm 221044.7031 (inf) mem 14543MB +[2023-10-12 11:14:43 
simmim_pretrain](main_simmim.py 218): INFO Train: [118/200][3000/6787] eta 0:15:50 lr 0.000200 time 0.2469 (0.2510) loss 0.3626 (0.3584) grad_norm 279021.7188 (inf) mem 14543MB +[2023-10-12 11:16:49 simmim_pretrain](main_simmim.py 218): INFO Train: [118/200][3500/6787] eta 0:13:44 lr 0.000200 time 0.2538 (0.2510) loss 0.3774 (0.3586) grad_norm 280337.2188 (inf) mem 14543MB +[2023-10-12 11:18:54 simmim_pretrain](main_simmim.py 218): INFO Train: [118/200][4000/6787] eta 0:11:39 lr 0.000200 time 0.2460 (0.2509) loss 0.3675 (0.3588) grad_norm 351947.3125 (inf) mem 14543MB +[2023-10-12 11:20:59 simmim_pretrain](main_simmim.py 218): INFO Train: [118/200][4500/6787] eta 0:09:33 lr 0.000200 time 0.2465 (0.2508) loss 0.3515 (0.3589) grad_norm 241084.0781 (inf) mem 14543MB +[2023-10-12 11:23:04 simmim_pretrain](main_simmim.py 218): INFO Train: [118/200][5000/6787] eta 0:07:28 lr 0.000200 time 0.2532 (0.2508) loss 0.3487 (0.3588) grad_norm 578943.7500 (inf) mem 14543MB +[2023-10-12 11:25:09 simmim_pretrain](main_simmim.py 218): INFO Train: [118/200][5500/6787] eta 0:05:22 lr 0.000200 time 0.2462 (0.2507) loss 0.3833 (0.3588) grad_norm 547064.3125 (inf) mem 14543MB +[2023-10-12 11:27:15 simmim_pretrain](main_simmim.py 218): INFO Train: [118/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2509 (0.2507) loss 0.3533 (0.3588) grad_norm 372251.4062 (inf) mem 14543MB +[2023-10-12 11:29:20 simmim_pretrain](main_simmim.py 218): INFO Train: [118/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2539 (0.2507) loss 0.3623 (0.3588) grad_norm 190820.9062 (inf) mem 14543MB +[2023-10-12 11:30:32 simmim_pretrain](main_simmim.py 228): INFO EPOCH 118 training takes 0:28:22 +[2023-10-12 11:30:34 simmim_pretrain](main_simmim.py 218): INFO Train: [119/200][0/6787] eta 2:46:24 lr 0.000200 time 1.4712 (1.4712) loss 0.3501 (0.3501) grad_norm 307389.3750 (307389.3750) mem 14543MB +[2023-10-12 11:32:39 simmim_pretrain](main_simmim.py 218): INFO Train: [119/200][500/6787] eta 0:26:30 lr 0.000200 time 0.2554 
(0.2529) loss 0.3696 (0.3601) grad_norm 200718.2656 (252041.5938) mem 14543MB +[2023-10-12 11:34:45 simmim_pretrain](main_simmim.py 218): INFO Train: [119/200][1000/6787] eta 0:24:18 lr 0.000200 time 0.2514 (0.2520) loss 0.3542 (0.3601) grad_norm 283518.0625 (inf) mem 14543MB +[2023-10-12 11:36:50 simmim_pretrain](main_simmim.py 218): INFO Train: [119/200][1500/6787] eta 0:22:10 lr 0.000200 time 0.2525 (0.2517) loss 0.3896 (0.3597) grad_norm 386525.5000 (inf) mem 14543MB +[2023-10-12 11:38:56 simmim_pretrain](main_simmim.py 218): INFO Train: [119/200][2000/6787] eta 0:20:04 lr 0.000200 time 0.2581 (0.2516) loss 0.3781 (0.3598) grad_norm 329257.7188 (inf) mem 14543MB +[2023-10-12 11:41:02 simmim_pretrain](main_simmim.py 218): INFO Train: [119/200][2500/6787] eta 0:17:58 lr 0.000200 time 0.2525 (0.2516) loss 0.3810 (0.3598) grad_norm 271186.5312 (inf) mem 14543MB +[2023-10-12 11:43:07 simmim_pretrain](main_simmim.py 218): INFO Train: [119/200][3000/6787] eta 0:15:52 lr 0.000200 time 0.2516 (0.2516) loss 0.3477 (0.3604) grad_norm 174199.8125 (inf) mem 14543MB +[2023-10-12 11:45:13 simmim_pretrain](main_simmim.py 218): INFO Train: [119/200][3500/6787] eta 0:13:47 lr 0.000200 time 0.2486 (0.2517) loss 0.3583 (0.3607) grad_norm 129420.9297 (inf) mem 14543MB +[2023-10-12 11:47:19 simmim_pretrain](main_simmim.py 218): INFO Train: [119/200][4000/6787] eta 0:11:41 lr 0.000200 time 0.2491 (0.2517) loss 0.3555 (0.3609) grad_norm 104844.6250 (inf) mem 14543MB +[2023-10-12 11:49:25 simmim_pretrain](main_simmim.py 218): INFO Train: [119/200][4500/6787] eta 0:09:35 lr 0.000200 time 0.2495 (0.2517) loss 0.3691 (0.3610) grad_norm 201065.5781 (inf) mem 14543MB +[2023-10-12 11:51:31 simmim_pretrain](main_simmim.py 218): INFO Train: [119/200][5000/6787] eta 0:07:29 lr 0.000200 time 0.2507 (0.2516) loss 0.3626 (0.3609) grad_norm 162001.1875 (inf) mem 14543MB +[2023-10-12 11:53:36 simmim_pretrain](main_simmim.py 218): INFO Train: [119/200][5500/6787] eta 0:05:23 lr 0.000200 time 0.2529 
(0.2516) loss 0.3536 (0.3609) grad_norm 192356.4375 (inf) mem 14543MB +[2023-10-12 11:55:42 simmim_pretrain](main_simmim.py 218): INFO Train: [119/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2504 (0.2516) loss 0.3830 (0.3607) grad_norm 259239.2812 (inf) mem 14543MB +[2023-10-12 11:57:48 simmim_pretrain](main_simmim.py 218): INFO Train: [119/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2496 (0.2516) loss 0.3366 (0.3607) grad_norm 240705.6719 (inf) mem 14543MB +[2023-10-12 11:59:00 simmim_pretrain](main_simmim.py 228): INFO EPOCH 119 training takes 0:28:28 +[2023-10-12 11:59:02 simmim_pretrain](main_simmim.py 218): INFO Train: [120/200][0/6787] eta 2:52:51 lr 0.000200 time 1.5281 (1.5281) loss 0.3496 (0.3496) grad_norm 231375.3125 (231375.3125) mem 14543MB +[2023-10-12 12:01:07 simmim_pretrain](main_simmim.py 218): INFO Train: [120/200][500/6787] eta 0:26:27 lr 0.000200 time 0.2546 (0.2526) loss 0.3425 (0.3579) grad_norm 423830.3750 (328276.5312) mem 14543MB +[2023-10-12 12:03:12 simmim_pretrain](main_simmim.py 218): INFO Train: [120/200][1000/6787] eta 0:24:16 lr 0.000200 time 0.2579 (0.2516) loss 0.3797 (0.3581) grad_norm 281724.2500 (inf) mem 14543MB +[2023-10-12 12:05:17 simmim_pretrain](main_simmim.py 218): INFO Train: [120/200][1500/6787] eta 0:22:08 lr 0.000200 time 0.2510 (0.2513) loss 0.3384 (0.3586) grad_norm 320460.1250 (inf) mem 14543MB +[2023-10-12 12:07:23 simmim_pretrain](main_simmim.py 218): INFO Train: [120/200][2000/6787] eta 0:20:01 lr 0.000200 time 0.2488 (0.2510) loss 0.3446 (0.3588) grad_norm 195391.7656 (inf) mem 14543MB +[2023-10-12 12:09:28 simmim_pretrain](main_simmim.py 218): INFO Train: [120/200][2500/6787] eta 0:17:55 lr 0.000200 time 0.2524 (0.2509) loss 0.3684 (0.3588) grad_norm 117798.2656 (inf) mem 14543MB +[2023-10-12 12:11:33 simmim_pretrain](main_simmim.py 218): INFO Train: [120/200][3000/6787] eta 0:15:50 lr 0.000200 time 0.2523 (0.2509) loss 0.3596 (0.3590) grad_norm 382798.8125 (inf) mem 14543MB +[2023-10-12 12:13:38 
simmim_pretrain](main_simmim.py 218): INFO Train: [120/200][3500/6787] eta 0:13:44 lr 0.000200 time 0.2521 (0.2508) loss 0.3576 (0.3591) grad_norm 227173.5000 (inf) mem 14543MB +[2023-10-12 12:15:44 simmim_pretrain](main_simmim.py 218): INFO Train: [120/200][4000/6787] eta 0:11:38 lr 0.000200 time 0.2460 (0.2508) loss 0.5095 (0.3653) grad_norm 12666.9590 (inf) mem 14543MB +[2023-10-12 12:17:49 simmim_pretrain](main_simmim.py 218): INFO Train: [120/200][4500/6787] eta 0:09:33 lr 0.000200 time 0.2538 (0.2507) loss 0.3875 (0.3766) grad_norm 30068.6680 (inf) mem 14543MB +[2023-10-12 12:19:54 simmim_pretrain](main_simmim.py 218): INFO Train: [120/200][5000/6787] eta 0:07:27 lr 0.000200 time 0.2462 (0.2506) loss 0.3677 (0.3764) grad_norm 39233.9609 (inf) mem 14543MB +[2023-10-12 12:21:59 simmim_pretrain](main_simmim.py 218): INFO Train: [120/200][5500/6787] eta 0:05:22 lr 0.000200 time 0.2486 (0.2506) loss 0.3702 (0.3756) grad_norm 36601.4844 (inf) mem 14543MB +[2023-10-12 12:24:04 simmim_pretrain](main_simmim.py 218): INFO Train: [120/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2461 (0.2505) loss 0.3816 (0.3748) grad_norm 27358.3164 (inf) mem 14543MB +[2023-10-12 12:26:09 simmim_pretrain](main_simmim.py 218): INFO Train: [120/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2507 (0.2505) loss 0.3597 (0.3739) grad_norm 42402.3398 (inf) mem 14543MB +[2023-10-12 12:27:21 simmim_pretrain](main_simmim.py 228): INFO EPOCH 120 training takes 0:28:20 +[2023-10-12 12:27:21 simmim_pretrain](utils.py 62): INFO /root/autodl-tmp/LSQ-simmim/checkpoint/simmim_pretrain/vit_simmim/ckpt_epoch_120.pth saving...... +[2023-10-12 12:27:22 simmim_pretrain](utils.py 64): INFO /root/autodl-tmp/LSQ-simmim/checkpoint/simmim_pretrain/vit_simmim/ckpt_epoch_120.pth saved !!! 
+[2023-10-12 12:27:23 simmim_pretrain](main_simmim.py 218): INFO Train: [121/200][0/6787] eta 2:27:56 lr 0.000200 time 1.3078 (1.3078) loss 0.3718 (0.3718) grad_norm 71027.0547 (71027.0547) mem 14543MB +[2023-10-12 12:29:28 simmim_pretrain](main_simmim.py 218): INFO Train: [121/200][500/6787] eta 0:26:21 lr 0.000200 time 0.2463 (0.2516) loss 0.3743 (0.3628) grad_norm 38354.6367 (48843.0352) mem 14543MB +[2023-10-12 12:31:33 simmim_pretrain](main_simmim.py 218): INFO Train: [121/200][1000/6787] eta 0:24:11 lr 0.000200 time 0.2471 (0.2508) loss 0.3620 (0.3627) grad_norm 44278.9961 (49520.5820) mem 14543MB +[2023-10-12 12:33:38 simmim_pretrain](main_simmim.py 218): INFO Train: [121/200][1500/6787] eta 0:22:05 lr 0.000200 time 0.2533 (0.2506) loss 0.3683 (0.3624) grad_norm 68510.8594 (54225.5430) mem 14543MB +[2023-10-12 12:35:44 simmim_pretrain](main_simmim.py 218): INFO Train: [121/200][2000/6787] eta 0:20:00 lr 0.000200 time 0.2530 (0.2507) loss 0.3340 (0.3622) grad_norm 63377.1953 (61776.0352) mem 14543MB +[2023-10-12 12:37:49 simmim_pretrain](main_simmim.py 218): INFO Train: [121/200][2500/6787] eta 0:17:55 lr 0.000200 time 0.2497 (0.2508) loss 0.3762 (0.3617) grad_norm 68822.3906 (65922.7656) mem 14543MB +[2023-10-12 12:39:55 simmim_pretrain](main_simmim.py 218): INFO Train: [121/200][3000/6787] eta 0:15:50 lr 0.000200 time 0.2508 (0.2509) loss 0.3453 (0.3616) grad_norm 99822.3438 (71108.1172) mem 14543MB +[2023-10-12 12:42:00 simmim_pretrain](main_simmim.py 218): INFO Train: [121/200][3500/6787] eta 0:13:44 lr 0.000200 time 0.2588 (0.2510) loss 0.3587 (0.3613) grad_norm 169144.6094 (80026.8047) mem 14543MB +[2023-10-12 12:44:06 simmim_pretrain](main_simmim.py 218): INFO Train: [121/200][4000/6787] eta 0:11:39 lr 0.000200 time 0.2462 (0.2510) loss 0.3525 (0.3611) grad_norm 337837.2188 (87072.6328) mem 14543MB +[2023-10-12 12:46:11 simmim_pretrain](main_simmim.py 218): INFO Train: [121/200][4500/6787] eta 0:09:33 lr 0.000200 time 0.2585 (0.2510) loss 0.3491 
(0.3609) grad_norm 115191.3438 (103367.2500) mem 14543MB +[2023-10-12 12:48:17 simmim_pretrain](main_simmim.py 218): INFO Train: [121/200][5000/6787] eta 0:07:28 lr 0.000200 time 0.2542 (0.2509) loss 0.3565 (0.3607) grad_norm 123019.7812 (109690.3438) mem 14543MB +[2023-10-12 12:50:22 simmim_pretrain](main_simmim.py 218): INFO Train: [121/200][5500/6787] eta 0:05:22 lr 0.000200 time 0.2484 (0.2509) loss 0.3653 (0.3605) grad_norm 459930.1875 (125741.1562) mem 14543MB +[2023-10-12 12:52:27 simmim_pretrain](main_simmim.py 218): INFO Train: [121/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2514 (0.2509) loss 0.3556 (0.3602) grad_norm 138118.7344 (141188.4531) mem 14543MB +[2023-10-12 12:54:32 simmim_pretrain](main_simmim.py 218): INFO Train: [121/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2502 (0.2508) loss 0.3535 (0.3600) grad_norm 175545.0469 (158868.9531) mem 14543MB +[2023-10-12 12:55:45 simmim_pretrain](main_simmim.py 228): INFO EPOCH 121 training takes 0:28:22 +[2023-10-12 12:55:46 simmim_pretrain](main_simmim.py 218): INFO Train: [122/200][0/6787] eta 2:44:05 lr 0.000200 time 1.4506 (1.4506) loss 0.3513 (0.3513) grad_norm 203051.9531 (203051.9531) mem 14543MB +[2023-10-12 12:57:51 simmim_pretrain](main_simmim.py 218): INFO Train: [122/200][500/6787] eta 0:26:27 lr 0.000200 time 0.2579 (0.2525) loss 0.3467 (0.3568) grad_norm 674823.3125 (inf) mem 14543MB +[2023-10-12 12:59:56 simmim_pretrain](main_simmim.py 218): INFO Train: [122/200][1000/6787] eta 0:24:15 lr 0.000200 time 0.2475 (0.2516) loss 0.3458 (0.3574) grad_norm 211274.0625 (inf) mem 14543MB +[2023-10-12 13:02:02 simmim_pretrain](main_simmim.py 218): INFO Train: [122/200][1500/6787] eta 0:22:07 lr 0.000200 time 0.2459 (0.2512) loss 0.3865 (0.3575) grad_norm 364379.0938 (inf) mem 14543MB +[2023-10-12 13:04:07 simmim_pretrain](main_simmim.py 218): INFO Train: [122/200][2000/6787] eta 0:20:02 lr 0.000200 time 0.2589 (0.2511) loss 0.3576 (0.3574) grad_norm 215404.4219 (inf) mem 14543MB +[2023-10-12 
13:06:13 simmim_pretrain](main_simmim.py 218): INFO Train: [122/200][2500/6787] eta 0:17:56 lr 0.000200 time 0.2465 (0.2511) loss 0.3417 (0.3576) grad_norm 608808.3125 (inf) mem 14543MB +[2023-10-12 13:08:18 simmim_pretrain](main_simmim.py 218): INFO Train: [122/200][3000/6787] eta 0:15:50 lr 0.000200 time 0.2474 (0.2511) loss 0.3638 (0.3575) grad_norm 402894.5625 (inf) mem 14543MB +[2023-10-12 13:10:24 simmim_pretrain](main_simmim.py 218): INFO Train: [122/200][3500/6787] eta 0:13:45 lr 0.000200 time 0.2608 (0.2510) loss 0.3583 (0.3575) grad_norm 379572.6562 (inf) mem 14543MB +[2023-10-12 13:12:29 simmim_pretrain](main_simmim.py 218): INFO Train: [122/200][4000/6787] eta 0:11:39 lr 0.000200 time 0.2524 (0.2511) loss 0.3452 (0.3577) grad_norm 552910.2500 (inf) mem 14543MB +[2023-10-12 13:14:35 simmim_pretrain](main_simmim.py 218): INFO Train: [122/200][4500/6787] eta 0:09:34 lr 0.000200 time 0.2541 (0.2511) loss 0.3673 (0.3578) grad_norm 1043585.6875 (inf) mem 14543MB +[2023-10-12 13:16:40 simmim_pretrain](main_simmim.py 218): INFO Train: [122/200][5000/6787] eta 0:07:28 lr 0.000200 time 0.2537 (0.2511) loss 0.3662 (0.3576) grad_norm 338792.8125 (inf) mem 14543MB +[2023-10-12 13:18:46 simmim_pretrain](main_simmim.py 218): INFO Train: [122/200][5500/6787] eta 0:05:23 lr 0.000200 time 0.2544 (0.2512) loss 0.3466 (0.3577) grad_norm 247120.5781 (inf) mem 14543MB +[2023-10-12 13:20:52 simmim_pretrain](main_simmim.py 218): INFO Train: [122/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2532 (0.2512) loss 0.3596 (0.3578) grad_norm 289515.0000 (inf) mem 14543MB +[2023-10-12 13:23:01 simmim_pretrain](main_simmim.py 218): INFO Train: [122/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2586 (0.2517) loss 0.3573 (0.3579) grad_norm 113469.9141 (inf) mem 14543MB +[2023-10-12 13:24:16 simmim_pretrain](main_simmim.py 228): INFO EPOCH 122 training takes 0:28:31 +[2023-10-12 13:24:17 simmim_pretrain](main_simmim.py 218): INFO Train: [123/200][0/6787] eta 2:31:49 lr 0.000200 time 
1.3422 (1.3422) loss 0.3680 (0.3680) grad_norm 118650.3281 (118650.3281) mem 14543MB +[2023-10-12 13:26:22 simmim_pretrain](main_simmim.py 218): INFO Train: [123/200][500/6787] eta 0:26:28 lr 0.000200 time 0.2454 (0.2527) loss 0.3673 (0.3596) grad_norm 211398.9062 (245909.9375) mem 14543MB +[2023-10-12 13:28:28 simmim_pretrain](main_simmim.py 218): INFO Train: [123/200][1000/6787] eta 0:24:18 lr 0.000200 time 0.2510 (0.2520) loss 0.3265 (0.3586) grad_norm 542161.5625 (274465.7500) mem 14543MB +[2023-10-12 13:30:34 simmim_pretrain](main_simmim.py 218): INFO Train: [123/200][1500/6787] eta 0:22:10 lr 0.000200 time 0.2478 (0.2517) loss 0.3558 (0.3584) grad_norm 278594.5625 (293624.6250) mem 14543MB +[2023-10-12 13:32:39 simmim_pretrain](main_simmim.py 218): INFO Train: [123/200][2000/6787] eta 0:20:03 lr 0.000200 time 0.2462 (0.2515) loss 0.3367 (0.3582) grad_norm 137553.5156 (310133.2812) mem 14543MB +[2023-10-12 13:34:44 simmim_pretrain](main_simmim.py 218): INFO Train: [123/200][2500/6787] eta 0:17:57 lr 0.000200 time 0.2466 (0.2513) loss 0.3399 (0.3581) grad_norm 281124.1250 (inf) mem 14543MB +[2023-10-12 13:36:52 simmim_pretrain](main_simmim.py 218): INFO Train: [123/200][3000/6787] eta 0:15:53 lr 0.000200 time 0.2610 (0.2519) loss 0.3598 (0.3580) grad_norm 429134.5000 (inf) mem 14543MB +[2023-10-12 13:39:02 simmim_pretrain](main_simmim.py 218): INFO Train: [123/200][3500/6787] eta 0:13:51 lr 0.000200 time 0.2611 (0.2530) loss 0.3627 (0.3579) grad_norm 350622.9375 (inf) mem 14543MB +[2023-10-12 13:41:12 simmim_pretrain](main_simmim.py 218): INFO Train: [123/200][4000/6787] eta 0:11:47 lr 0.000200 time 0.2608 (0.2539) loss 0.3451 (0.3579) grad_norm 447717.9688 (inf) mem 14543MB +[2023-10-12 13:43:21 simmim_pretrain](main_simmim.py 218): INFO Train: [123/200][4500/6787] eta 0:09:42 lr 0.000200 time 0.2592 (0.2545) loss 0.3622 (0.3582) grad_norm 181476.1094 (inf) mem 14543MB +[2023-10-12 13:45:31 simmim_pretrain](main_simmim.py 218): INFO Train: [123/200][5000/6787] 
eta 0:07:35 lr 0.000200 time 0.2607 (0.2550) loss 0.3611 (0.3585) grad_norm 122111.7656 (inf) mem 14543MB +[2023-10-12 13:47:41 simmim_pretrain](main_simmim.py 218): INFO Train: [123/200][5500/6787] eta 0:05:28 lr 0.000200 time 0.2611 (0.2555) loss 0.3571 (0.3587) grad_norm 123192.2656 (inf) mem 14543MB +[2023-10-12 13:49:51 simmim_pretrain](main_simmim.py 218): INFO Train: [123/200][6000/6787] eta 0:03:21 lr 0.000200 time 0.2608 (0.2558) loss 0.3611 (0.3589) grad_norm 91571.2344 (inf) mem 14543MB +[2023-10-12 13:52:01 simmim_pretrain](main_simmim.py 218): INFO Train: [123/200][6500/6787] eta 0:01:13 lr 0.000200 time 0.2608 (0.2561) loss 0.3452 (0.3590) grad_norm 80135.4453 (inf) mem 14543MB +[2023-10-12 13:53:16 simmim_pretrain](main_simmim.py 228): INFO EPOCH 123 training takes 0:29:00 +[2023-10-12 13:53:17 simmim_pretrain](main_simmim.py 218): INFO Train: [124/200][0/6787] eta 2:34:55 lr 0.000200 time 1.3696 (1.3696) loss 0.3626 (0.3626) grad_norm 156864.5156 (156864.5156) mem 14543MB +[2023-10-12 13:55:23 simmim_pretrain](main_simmim.py 218): INFO Train: [124/200][500/6787] eta 0:26:30 lr 0.000200 time 0.2491 (0.2529) loss 0.3368 (0.3599) grad_norm 267711.9375 (176202.7031) mem 14543MB +[2023-10-12 13:57:28 simmim_pretrain](main_simmim.py 218): INFO Train: [124/200][1000/6787] eta 0:24:17 lr 0.000200 time 0.2503 (0.2518) loss 0.3514 (0.3597) grad_norm 281073.4062 (179148.3594) mem 14543MB +[2023-10-12 13:59:34 simmim_pretrain](main_simmim.py 218): INFO Train: [124/200][1500/6787] eta 0:22:10 lr 0.000200 time 0.2516 (0.2516) loss 0.3491 (0.3592) grad_norm 358753.0000 (189866.9688) mem 14543MB +[2023-10-12 14:01:40 simmim_pretrain](main_simmim.py 218): INFO Train: [124/200][2000/6787] eta 0:20:04 lr 0.000200 time 0.2538 (0.2516) loss 0.3600 (0.3591) grad_norm 228855.5781 (inf) mem 14543MB +[2023-10-12 14:03:45 simmim_pretrain](main_simmim.py 218): INFO Train: [124/200][2500/6787] eta 0:17:58 lr 0.000200 time 0.2501 (0.2515) loss 0.3596 (0.3589) grad_norm 
321817.8750 (inf) mem 14543MB +[2023-10-12 14:05:51 simmim_pretrain](main_simmim.py 218): INFO Train: [124/200][3000/6787] eta 0:15:52 lr 0.000200 time 0.2576 (0.2516) loss 0.3431 (0.3589) grad_norm 93651.2969 (inf) mem 14543MB +[2023-10-12 14:07:57 simmim_pretrain](main_simmim.py 218): INFO Train: [124/200][3500/6787] eta 0:13:46 lr 0.000200 time 0.2487 (0.2515) loss 0.3771 (0.3588) grad_norm 244422.4219 (inf) mem 14543MB +[2023-10-12 14:10:02 simmim_pretrain](main_simmim.py 218): INFO Train: [124/200][4000/6787] eta 0:11:40 lr 0.000200 time 0.2469 (0.2515) loss 0.3485 (0.3587) grad_norm 178240.1875 (inf) mem 14543MB +[2023-10-12 14:12:08 simmim_pretrain](main_simmim.py 218): INFO Train: [124/200][4500/6787] eta 0:09:34 lr 0.000200 time 0.2522 (0.2514) loss 0.3608 (0.3587) grad_norm 686672.7500 (inf) mem 14543MB +[2023-10-12 14:14:13 simmim_pretrain](main_simmim.py 218): INFO Train: [124/200][5000/6787] eta 0:07:29 lr 0.000200 time 0.2462 (0.2514) loss 0.3469 (0.3586) grad_norm 214954.1719 (inf) mem 14543MB +[2023-10-12 14:16:19 simmim_pretrain](main_simmim.py 218): INFO Train: [124/200][5500/6787] eta 0:05:23 lr 0.000200 time 0.2505 (0.2513) loss 0.3625 (0.3584) grad_norm 390392.6875 (inf) mem 14543MB +[2023-10-12 14:18:24 simmim_pretrain](main_simmim.py 218): INFO Train: [124/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2536 (0.2513) loss 0.3422 (0.3584) grad_norm 432980.9062 (inf) mem 14543MB +[2023-10-12 14:20:30 simmim_pretrain](main_simmim.py 218): INFO Train: [124/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2514 (0.2513) loss 0.3644 (0.3582) grad_norm 849666.8750 (inf) mem 14543MB +[2023-10-12 14:21:43 simmim_pretrain](main_simmim.py 228): INFO EPOCH 124 training takes 0:28:26 +[2023-10-12 14:21:44 simmim_pretrain](main_simmim.py 218): INFO Train: [125/200][0/6787] eta 2:40:32 lr 0.000200 time 1.4193 (1.4193) loss 0.3657 (0.3657) grad_norm 323390.9688 (323390.9688) mem 14543MB +[2023-10-12 14:23:50 simmim_pretrain](main_simmim.py 218): INFO Train: 
[125/200][500/6787] eta 0:26:38 lr 0.000200 time 0.2490 (0.2543) loss 0.3492 (0.3591) grad_norm 211577.9062 (248741.5781) mem 14543MB +[2023-10-12 14:25:56 simmim_pretrain](main_simmim.py 218): INFO Train: [125/200][1000/6787] eta 0:24:26 lr 0.000200 time 0.2544 (0.2534) loss 0.3683 (0.3593) grad_norm 275156.5312 (240182.2656) mem 14543MB +[2023-10-12 14:28:02 simmim_pretrain](main_simmim.py 218): INFO Train: [125/200][1500/6787] eta 0:22:17 lr 0.000200 time 0.2540 (0.2529) loss 0.3649 (0.3594) grad_norm 152006.5156 (237202.6562) mem 14543MB +[2023-10-12 14:30:08 simmim_pretrain](main_simmim.py 218): INFO Train: [125/200][2000/6787] eta 0:20:09 lr 0.000200 time 0.2519 (0.2526) loss 0.3559 (0.3595) grad_norm 152184.3594 (247293.1094) mem 14543MB +[2023-10-12 14:32:14 simmim_pretrain](main_simmim.py 218): INFO Train: [125/200][2500/6787] eta 0:18:02 lr 0.000200 time 0.2499 (0.2525) loss 0.3462 (0.3592) grad_norm 254810.3906 (263619.9688) mem 14543MB +[2023-10-12 14:34:20 simmim_pretrain](main_simmim.py 218): INFO Train: [125/200][3000/6787] eta 0:15:55 lr 0.000200 time 0.2584 (0.2523) loss 0.3735 (0.3591) grad_norm 461938.6250 (279142.9375) mem 14543MB +[2023-10-12 14:36:26 simmim_pretrain](main_simmim.py 218): INFO Train: [125/200][3500/6787] eta 0:13:49 lr 0.000200 time 0.2493 (0.2523) loss 0.3520 (0.3589) grad_norm 379179.9062 (291557.3750) mem 14543MB +[2023-10-12 14:38:32 simmim_pretrain](main_simmim.py 218): INFO Train: [125/200][4000/6787] eta 0:11:42 lr 0.000200 time 0.2551 (0.2522) loss 0.3813 (0.3587) grad_norm 473566.9062 (inf) mem 14543MB +[2023-10-12 14:40:38 simmim_pretrain](main_simmim.py 218): INFO Train: [125/200][4500/6787] eta 0:09:36 lr 0.000200 time 0.2552 (0.2522) loss 0.3638 (0.3585) grad_norm 403850.5938 (inf) mem 14543MB +[2023-10-12 14:42:44 simmim_pretrain](main_simmim.py 218): INFO Train: [125/200][5000/6787] eta 0:07:30 lr 0.000200 time 0.2515 (0.2522) loss 0.3561 (0.3584) grad_norm 376596.0625 (inf) mem 14543MB +[2023-10-12 14:44:50 
simmim_pretrain](main_simmim.py 218): INFO Train: [125/200][5500/6787] eta 0:05:24 lr 0.000200 time 0.2502 (0.2522) loss 0.3579 (0.3583) grad_norm 543785.5625 (inf) mem 14543MB +[2023-10-12 14:46:57 simmim_pretrain](main_simmim.py 218): INFO Train: [125/200][6000/6787] eta 0:03:18 lr 0.000200 time 0.2566 (0.2523) loss 0.3395 (0.3582) grad_norm 456096.5312 (inf) mem 14543MB +[2023-10-12 14:49:04 simmim_pretrain](main_simmim.py 218): INFO Train: [125/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2530 (0.2525) loss 0.3846 (0.3582) grad_norm 375934.6875 (inf) mem 14543MB +[2023-10-12 14:50:18 simmim_pretrain](main_simmim.py 228): INFO EPOCH 125 training takes 0:28:35 +[2023-10-12 14:50:20 simmim_pretrain](main_simmim.py 218): INFO Train: [126/200][0/6787] eta 2:57:01 lr 0.000200 time 1.5649 (1.5649) loss 0.3423 (0.3423) grad_norm 253777.9531 (253777.9531) mem 14543MB +[2023-10-12 14:52:29 simmim_pretrain](main_simmim.py 218): INFO Train: [126/200][500/6787] eta 0:27:17 lr 0.000200 time 0.2594 (0.2605) loss 0.3569 (0.3570) grad_norm 363599.9375 (430049.2812) mem 14543MB +[2023-10-12 14:54:38 simmim_pretrain](main_simmim.py 218): INFO Train: [126/200][1000/6787] eta 0:25:00 lr 0.000200 time 0.2592 (0.2592) loss 0.3401 (0.3577) grad_norm 390369.9375 (inf) mem 14543MB +[2023-10-12 14:56:46 simmim_pretrain](main_simmim.py 218): INFO Train: [126/200][1500/6787] eta 0:22:46 lr 0.000200 time 0.2496 (0.2585) loss 0.3606 (0.3579) grad_norm 323866.9062 (inf) mem 14543MB +[2023-10-12 14:58:55 simmim_pretrain](main_simmim.py 218): INFO Train: [126/200][2000/6787] eta 0:20:35 lr 0.000200 time 0.2532 (0.2581) loss 0.3431 (0.3581) grad_norm 89422.7266 (inf) mem 14543MB +[2023-10-12 15:01:02 simmim_pretrain](main_simmim.py 218): INFO Train: [126/200][2500/6787] eta 0:18:24 lr 0.000200 time 0.2542 (0.2576) loss 0.3604 (0.3583) grad_norm 316622.4375 (inf) mem 14543MB +[2023-10-12 15:03:10 simmim_pretrain](main_simmim.py 218): INFO Train: [126/200][3000/6787] eta 0:16:14 lr 0.000200 time 
0.2533 (0.2572) loss 0.3762 (0.3583) grad_norm 241167.0469 (inf) mem 14543MB +[2023-10-12 15:05:18 simmim_pretrain](main_simmim.py 218): INFO Train: [126/200][3500/6787] eta 0:14:04 lr 0.000200 time 0.2591 (0.2570) loss 0.3535 (0.3583) grad_norm 584936.5000 (inf) mem 14543MB +[2023-10-12 15:07:27 simmim_pretrain](main_simmim.py 218): INFO Train: [126/200][4000/6787] eta 0:11:56 lr 0.000200 time 0.2611 (0.2572) loss 0.3591 (0.3582) grad_norm 597492.1250 (inf) mem 14543MB +[2023-10-12 15:09:36 simmim_pretrain](main_simmim.py 218): INFO Train: [126/200][4500/6787] eta 0:09:48 lr 0.000200 time 0.2560 (0.2572) loss 0.3579 (0.3582) grad_norm 484591.1875 (inf) mem 14543MB +[2023-10-12 15:11:46 simmim_pretrain](main_simmim.py 218): INFO Train: [126/200][5000/6787] eta 0:07:39 lr 0.000200 time 0.2609 (0.2574) loss 0.3429 (0.3582) grad_norm 250718.5625 (inf) mem 14543MB +[2023-10-12 15:13:56 simmim_pretrain](main_simmim.py 218): INFO Train: [126/200][5500/6787] eta 0:05:31 lr 0.000200 time 0.2609 (0.2576) loss 0.4048 (0.3583) grad_norm 147934.5312 (inf) mem 14543MB +[2023-10-12 15:16:06 simmim_pretrain](main_simmim.py 218): INFO Train: [126/200][6000/6787] eta 0:03:22 lr 0.000200 time 0.2609 (0.2578) loss 0.3695 (0.3583) grad_norm 273047.6250 (inf) mem 14543MB +[2023-10-12 15:18:16 simmim_pretrain](main_simmim.py 218): INFO Train: [126/200][6500/6787] eta 0:01:14 lr 0.000200 time 0.2611 (0.2580) loss 0.3589 (0.3585) grad_norm 276722.5625 (inf) mem 14543MB +[2023-10-12 15:19:31 simmim_pretrain](main_simmim.py 228): INFO EPOCH 126 training takes 0:29:12 +[2023-10-12 15:19:32 simmim_pretrain](main_simmim.py 218): INFO Train: [127/200][0/6787] eta 2:42:51 lr 0.000200 time 1.4397 (1.4397) loss 0.3766 (0.3766) grad_norm 373395.7812 (373395.7812) mem 14543MB +[2023-10-12 15:21:38 simmim_pretrain](main_simmim.py 218): INFO Train: [127/200][500/6787] eta 0:26:36 lr 0.000200 time 0.2450 (0.2540) loss 0.3488 (0.3574) grad_norm 219675.9375 (344170.9688) mem 14543MB +[2023-10-12 15:23:44 
simmim_pretrain](main_simmim.py 218): INFO Train: [127/200][1000/6787] eta 0:24:24 lr 0.000200 time 0.2537 (0.2531) loss 0.3344 (0.3581) grad_norm 192416.8281 (404688.0938) mem 14543MB +[2023-10-12 15:25:50 simmim_pretrain](main_simmim.py 218): INFO Train: [127/200][1500/6787] eta 0:22:16 lr 0.000200 time 0.2504 (0.2528) loss 0.3710 (0.3583) grad_norm 512367.7500 (394950.2500) mem 14543MB +[2023-10-12 15:27:56 simmim_pretrain](main_simmim.py 218): INFO Train: [127/200][2000/6787] eta 0:20:08 lr 0.000200 time 0.2451 (0.2525) loss 0.3821 (0.3580) grad_norm 757513.3125 (inf) mem 14543MB +[2023-10-12 15:30:02 simmim_pretrain](main_simmim.py 218): INFO Train: [127/200][2500/6787] eta 0:18:01 lr 0.000200 time 0.2518 (0.2523) loss 0.3453 (0.3579) grad_norm 254295.4062 (inf) mem 14543MB +[2023-10-12 15:32:07 simmim_pretrain](main_simmim.py 218): INFO Train: [127/200][3000/6787] eta 0:15:54 lr 0.000200 time 0.2530 (0.2521) loss 0.3657 (0.3578) grad_norm 592169.8125 (inf) mem 14543MB +[2023-10-12 15:34:13 simmim_pretrain](main_simmim.py 218): INFO Train: [127/200][3500/6787] eta 0:13:48 lr 0.000200 time 0.2486 (0.2520) loss 0.3401 (0.3579) grad_norm 303716.9062 (inf) mem 14543MB +[2023-10-12 15:36:19 simmim_pretrain](main_simmim.py 218): INFO Train: [127/200][4000/6787] eta 0:11:42 lr 0.000200 time 0.2478 (0.2519) loss 0.3599 (0.3583) grad_norm 257904.7188 (inf) mem 14543MB +[2023-10-12 15:38:24 simmim_pretrain](main_simmim.py 218): INFO Train: [127/200][4500/6787] eta 0:09:36 lr 0.000200 time 0.2487 (0.2519) loss 0.3648 (0.3587) grad_norm 163928.5469 (inf) mem 14543MB +[2023-10-12 15:40:30 simmim_pretrain](main_simmim.py 218): INFO Train: [127/200][5000/6787] eta 0:07:29 lr 0.000200 time 0.2456 (0.2517) loss 0.3663 (0.3590) grad_norm 155262.6250 (inf) mem 14543MB +[2023-10-12 15:42:35 simmim_pretrain](main_simmim.py 218): INFO Train: [127/200][5500/6787] eta 0:05:23 lr 0.000200 time 0.2517 (0.2516) loss 0.3553 (0.3592) grad_norm 81886.1719 (inf) mem 14543MB +[2023-10-12 
15:44:40 simmim_pretrain](main_simmim.py 218): INFO Train: [127/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2506 (0.2516) loss 0.3552 (0.3593) grad_norm 118899.3672 (inf) mem 14543MB +[2023-10-12 15:46:46 simmim_pretrain](main_simmim.py 218): INFO Train: [127/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2531 (0.2515) loss 0.3488 (0.3593) grad_norm 201421.7500 (inf) mem 14543MB +[2023-10-12 15:47:59 simmim_pretrain](main_simmim.py 228): INFO EPOCH 127 training takes 0:28:28 +[2023-10-12 15:48:00 simmim_pretrain](main_simmim.py 218): INFO Train: [128/200][0/6787] eta 2:51:46 lr 0.000200 time 1.5186 (1.5186) loss 0.3862 (0.3862) grad_norm 217034.2500 (217034.2500) mem 14543MB +[2023-10-12 15:50:06 simmim_pretrain](main_simmim.py 218): INFO Train: [128/200][500/6787] eta 0:26:32 lr 0.000200 time 0.2527 (0.2533) loss 0.3431 (0.3603) grad_norm 282695.5938 (208315.7656) mem 14543MB +[2023-10-12 15:52:11 simmim_pretrain](main_simmim.py 218): INFO Train: [128/200][1000/6787] eta 0:24:16 lr 0.000200 time 0.2511 (0.2518) loss 0.3599 (0.3599) grad_norm 337960.5625 (212261.0469) mem 14543MB +[2023-10-12 15:54:16 simmim_pretrain](main_simmim.py 218): INFO Train: [128/200][1500/6787] eta 0:22:08 lr 0.000200 time 0.2469 (0.2513) loss 0.3473 (0.3593) grad_norm 215515.5625 (236224.3906) mem 14543MB +[2023-10-12 15:56:21 simmim_pretrain](main_simmim.py 218): INFO Train: [128/200][2000/6787] eta 0:20:01 lr 0.000200 time 0.2492 (0.2511) loss 0.3423 (0.3587) grad_norm 511594.0938 (278825.6562) mem 14543MB +[2023-10-12 15:58:26 simmim_pretrain](main_simmim.py 218): INFO Train: [128/200][2500/6787] eta 0:17:55 lr 0.000200 time 0.2483 (0.2509) loss 0.3569 (0.3582) grad_norm 458018.4375 (315098.5312) mem 14543MB +[2023-10-12 16:00:32 simmim_pretrain](main_simmim.py 218): INFO Train: [128/200][3000/6787] eta 0:15:50 lr 0.000200 time 0.2550 (0.2509) loss 0.3695 (0.3584) grad_norm 455031.3750 (331489.3750) mem 14543MB +[2023-10-12 16:02:37 simmim_pretrain](main_simmim.py 218): INFO Train: 
[128/200][3500/6787] eta 0:13:44 lr 0.000200 time 0.2518 (0.2509) loss 0.3697 (0.3585) grad_norm 248475.2500 (inf) mem 14543MB +[2023-10-12 16:04:43 simmim_pretrain](main_simmim.py 218): INFO Train: [128/200][4000/6787] eta 0:11:39 lr 0.000200 time 0.2464 (0.2509) loss 0.3881 (0.3586) grad_norm 173020.7031 (inf) mem 14543MB +[2023-10-12 16:06:48 simmim_pretrain](main_simmim.py 218): INFO Train: [128/200][4500/6787] eta 0:09:33 lr 0.000200 time 0.2457 (0.2509) loss 0.3588 (0.3586) grad_norm 155351.0312 (inf) mem 14543MB +[2023-10-12 16:08:53 simmim_pretrain](main_simmim.py 218): INFO Train: [128/200][5000/6787] eta 0:07:28 lr 0.000200 time 0.2514 (0.2509) loss 0.3717 (0.3586) grad_norm 252390.2812 (inf) mem 14543MB +[2023-10-12 16:10:59 simmim_pretrain](main_simmim.py 218): INFO Train: [128/200][5500/6787] eta 0:05:22 lr 0.000200 time 0.2518 (0.2509) loss 0.3841 (0.3586) grad_norm 223258.7344 (inf) mem 14543MB +[2023-10-12 16:13:04 simmim_pretrain](main_simmim.py 218): INFO Train: [128/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2522 (0.2508) loss 0.3480 (0.3586) grad_norm 371494.4062 (inf) mem 14543MB +[2023-10-12 16:15:10 simmim_pretrain](main_simmim.py 218): INFO Train: [128/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2464 (0.2508) loss 0.3520 (0.3586) grad_norm 354770.1562 (inf) mem 14543MB +[2023-10-12 16:16:22 simmim_pretrain](main_simmim.py 228): INFO EPOCH 128 training takes 0:28:23 +[2023-10-12 16:16:23 simmim_pretrain](main_simmim.py 218): INFO Train: [129/200][0/6787] eta 2:44:01 lr 0.000200 time 1.4500 (1.4500) loss 0.3577 (0.3577) grad_norm 259416.8906 (259416.8906) mem 14543MB +[2023-10-12 16:18:28 simmim_pretrain](main_simmim.py 218): INFO Train: [129/200][500/6787] eta 0:26:26 lr 0.000200 time 0.2465 (0.2523) loss 0.3518 (0.3601) grad_norm 321515.8438 (inf) mem 14543MB +[2023-10-12 16:20:34 simmim_pretrain](main_simmim.py 218): INFO Train: [129/200][1000/6787] eta 0:24:16 lr 0.000200 time 0.2523 (0.2516) loss 0.3727 (0.3594) grad_norm 
193473.9531 (inf) mem 14543MB +[2023-10-12 16:22:39 simmim_pretrain](main_simmim.py 218): INFO Train: [129/200][1500/6787] eta 0:22:08 lr 0.000200 time 0.2533 (0.2513) loss 0.3690 (0.3595) grad_norm 194347.1406 (inf) mem 14543MB +[2023-10-12 16:24:45 simmim_pretrain](main_simmim.py 218): INFO Train: [129/200][2000/6787] eta 0:20:02 lr 0.000200 time 0.2460 (0.2512) loss 0.3686 (0.3600) grad_norm 123327.3047 (inf) mem 14543MB +[2023-10-12 16:26:50 simmim_pretrain](main_simmim.py 218): INFO Train: [129/200][2500/6787] eta 0:17:56 lr 0.000200 time 0.2565 (0.2511) loss 0.3478 (0.3602) grad_norm 156901.4844 (inf) mem 14543MB +[2023-10-12 16:28:55 simmim_pretrain](main_simmim.py 218): INFO Train: [129/200][3000/6787] eta 0:15:50 lr 0.000200 time 0.2499 (0.2511) loss 0.3757 (0.3605) grad_norm 135104.0938 (inf) mem 14543MB +[2023-10-12 16:31:01 simmim_pretrain](main_simmim.py 218): INFO Train: [129/200][3500/6787] eta 0:13:45 lr 0.000200 time 0.2504 (0.2510) loss 0.3595 (0.3606) grad_norm 85180.9297 (inf) mem 14543MB +[2023-10-12 16:33:06 simmim_pretrain](main_simmim.py 218): INFO Train: [129/200][4000/6787] eta 0:11:39 lr 0.000200 time 0.2515 (0.2511) loss 0.3595 (0.3605) grad_norm 101380.2188 (inf) mem 14543MB +[2023-10-12 16:35:12 simmim_pretrain](main_simmim.py 218): INFO Train: [129/200][4500/6787] eta 0:09:34 lr 0.000200 time 0.2495 (0.2510) loss 0.3749 (0.3605) grad_norm 129272.8047 (inf) mem 14543MB +[2023-10-12 16:37:17 simmim_pretrain](main_simmim.py 218): INFO Train: [129/200][5000/6787] eta 0:07:28 lr 0.000200 time 0.2528 (0.2510) loss 0.3439 (0.3603) grad_norm 196888.4375 (inf) mem 14543MB +[2023-10-12 16:39:23 simmim_pretrain](main_simmim.py 218): INFO Train: [129/200][5500/6787] eta 0:05:23 lr 0.000200 time 0.2515 (0.2511) loss 0.3673 (0.3601) grad_norm 250946.1094 (inf) mem 14543MB +[2023-10-12 16:41:29 simmim_pretrain](main_simmim.py 218): INFO Train: [129/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2530 (0.2511) loss 0.3446 (0.3599) grad_norm 
252532.6875 (inf) mem 14543MB +[2023-10-12 16:43:36 simmim_pretrain](main_simmim.py 218): INFO Train: [129/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2533 (0.2513) loss 0.3712 (0.3597) grad_norm 426896.1875 (inf) mem 14543MB +[2023-10-12 16:44:50 simmim_pretrain](main_simmim.py 228): INFO EPOCH 129 training takes 0:28:27 +[2023-10-12 16:44:51 simmim_pretrain](main_simmim.py 218): INFO Train: [130/200][0/6787] eta 2:54:38 lr 0.000200 time 1.5439 (1.5439) loss 0.3519 (0.3519) grad_norm 471710.4375 (471710.4375) mem 14543MB +[2023-10-12 16:46:58 simmim_pretrain](main_simmim.py 218): INFO Train: [130/200][500/6787] eta 0:26:55 lr 0.000200 time 0.2552 (0.2569) loss 0.3599 (0.3592) grad_norm 276504.0000 (371220.8125) mem 14543MB +[2023-10-12 16:49:06 simmim_pretrain](main_simmim.py 218): INFO Train: [130/200][1000/6787] eta 0:24:40 lr 0.000200 time 0.2507 (0.2558) loss 0.3549 (0.3580) grad_norm 342059.5312 (inf) mem 14543MB +[2023-10-12 16:51:13 simmim_pretrain](main_simmim.py 218): INFO Train: [130/200][1500/6787] eta 0:22:30 lr 0.000200 time 0.2587 (0.2554) loss 0.3576 (0.3577) grad_norm 235381.9531 (inf) mem 14543MB +[2023-10-12 16:53:20 simmim_pretrain](main_simmim.py 218): INFO Train: [130/200][2000/6787] eta 0:20:20 lr 0.000200 time 0.2542 (0.2551) loss 0.3481 (0.3577) grad_norm 617979.8125 (inf) mem 14543MB +[2023-10-12 16:55:27 simmim_pretrain](main_simmim.py 218): INFO Train: [130/200][2500/6787] eta 0:18:12 lr 0.000200 time 0.2550 (0.2548) loss 0.3503 (0.3577) grad_norm 393041.7812 (inf) mem 14543MB +[2023-10-12 16:57:34 simmim_pretrain](main_simmim.py 218): INFO Train: [130/200][3000/6787] eta 0:16:04 lr 0.000200 time 0.2533 (0.2546) loss 0.3402 (0.3577) grad_norm 603406.9375 (inf) mem 14543MB +[2023-10-12 16:59:40 simmim_pretrain](main_simmim.py 218): INFO Train: [130/200][3500/6787] eta 0:13:56 lr 0.000200 time 0.2511 (0.2544) loss 0.3503 (0.3578) grad_norm 366752.0938 (inf) mem 14543MB +[2023-10-12 17:01:47 simmim_pretrain](main_simmim.py 218): INFO 
Train: [130/200][4000/6787] eta 0:11:48 lr 0.000200 time 0.2523 (0.2543) loss 0.3662 (0.3578) grad_norm 262770.4375 (inf) mem 14543MB +[2023-10-12 17:03:53 simmim_pretrain](main_simmim.py 218): INFO Train: [130/200][4500/6787] eta 0:09:41 lr 0.000200 time 0.2535 (0.2541) loss 0.3482 (0.3580) grad_norm 242703.5000 (inf) mem 14543MB +[2023-10-12 17:06:00 simmim_pretrain](main_simmim.py 218): INFO Train: [130/200][5000/6787] eta 0:07:33 lr 0.000200 time 0.2532 (0.2540) loss 0.3486 (0.3582) grad_norm 268152.5312 (inf) mem 14543MB +[2023-10-12 17:08:06 simmim_pretrain](main_simmim.py 218): INFO Train: [130/200][5500/6787] eta 0:05:26 lr 0.000200 time 0.2540 (0.2539) loss 0.3716 (0.3583) grad_norm 327990.4062 (inf) mem 14543MB +[2023-10-12 17:10:12 simmim_pretrain](main_simmim.py 218): INFO Train: [130/200][6000/6787] eta 0:03:19 lr 0.000200 time 0.2523 (0.2538) loss 0.3543 (0.3583) grad_norm 245587.0312 (inf) mem 14543MB +[2023-10-12 17:12:19 simmim_pretrain](main_simmim.py 218): INFO Train: [130/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2504 (0.2537) loss 0.3670 (0.3582) grad_norm 291439.6562 (inf) mem 14543MB +[2023-10-12 17:13:32 simmim_pretrain](main_simmim.py 228): INFO EPOCH 130 training takes 0:28:42 +[2023-10-12 17:13:33 simmim_pretrain](main_simmim.py 218): INFO Train: [131/200][0/6787] eta 2:31:28 lr 0.000200 time 1.3391 (1.3391) loss 0.3429 (0.3429) grad_norm 155737.2969 (155737.2969) mem 14543MB +[2023-10-12 17:15:38 simmim_pretrain](main_simmim.py 218): INFO Train: [131/200][500/6787] eta 0:26:30 lr 0.000200 time 0.2496 (0.2529) loss 0.3679 (0.3612) grad_norm 93049.5078 (148288.1094) mem 14543MB +[2023-10-12 17:17:44 simmim_pretrain](main_simmim.py 218): INFO Train: [131/200][1000/6787] eta 0:24:17 lr 0.000200 time 0.2483 (0.2518) loss 0.3485 (0.3614) grad_norm 129578.6172 (141761.3281) mem 14543MB +[2023-10-12 17:19:49 simmim_pretrain](main_simmim.py 218): INFO Train: [131/200][1500/6787] eta 0:22:09 lr 0.000200 time 0.2515 (0.2514) loss 0.3749 
(0.3612) grad_norm 136663.1406 (138612.8281) mem 14543MB +[2023-10-12 17:21:54 simmim_pretrain](main_simmim.py 218): INFO Train: [131/200][2000/6787] eta 0:20:02 lr 0.000200 time 0.2501 (0.2513) loss 0.3553 (0.3611) grad_norm 133734.3750 (142076.5781) mem 14543MB +[2023-10-12 17:24:00 simmim_pretrain](main_simmim.py 218): INFO Train: [131/200][2500/6787] eta 0:17:56 lr 0.000200 time 0.2496 (0.2512) loss 0.3550 (0.3609) grad_norm 408582.1875 (153667.4531) mem 14543MB +[2023-10-12 17:26:05 simmim_pretrain](main_simmim.py 218): INFO Train: [131/200][3000/6787] eta 0:15:51 lr 0.000200 time 0.2512 (0.2511) loss 0.3580 (0.3606) grad_norm 282452.0938 (162964.1406) mem 14543MB +[2023-10-12 17:28:11 simmim_pretrain](main_simmim.py 218): INFO Train: [131/200][3500/6787] eta 0:13:45 lr 0.000200 time 0.2468 (0.2511) loss 0.3567 (0.3603) grad_norm 223185.2500 (171436.6875) mem 14543MB +[2023-10-12 17:30:16 simmim_pretrain](main_simmim.py 218): INFO Train: [131/200][4000/6787] eta 0:11:39 lr 0.000200 time 0.2469 (0.2511) loss 0.3557 (0.3600) grad_norm 321513.8438 (181662.6406) mem 14543MB +[2023-10-12 17:32:22 simmim_pretrain](main_simmim.py 218): INFO Train: [131/200][4500/6787] eta 0:09:34 lr 0.000200 time 0.2509 (0.2511) loss 0.3737 (0.3598) grad_norm 221285.6406 (inf) mem 14543MB +[2023-10-12 17:34:28 simmim_pretrain](main_simmim.py 218): INFO Train: [131/200][5000/6787] eta 0:07:28 lr 0.000200 time 0.2491 (0.2512) loss 0.3595 (0.3596) grad_norm 258731.4062 (inf) mem 14543MB +[2023-10-12 17:36:33 simmim_pretrain](main_simmim.py 218): INFO Train: [131/200][5500/6787] eta 0:05:23 lr 0.000200 time 0.2491 (0.2512) loss 0.3390 (0.3594) grad_norm 138180.3594 (inf) mem 14543MB +[2023-10-12 17:38:39 simmim_pretrain](main_simmim.py 218): INFO Train: [131/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2550 (0.2511) loss 0.3448 (0.3594) grad_norm 338153.8750 (inf) mem 14543MB +[2023-10-12 17:40:44 simmim_pretrain](main_simmim.py 218): INFO Train: [131/200][6500/6787] eta 0:01:12 lr 
0.000200 time 0.2542 (0.2511) loss 0.3889 (0.3594) grad_norm 120454.3828 (inf) mem 14543MB +[2023-10-12 17:41:56 simmim_pretrain](main_simmim.py 228): INFO EPOCH 131 training takes 0:28:24 +[2023-10-12 17:41:58 simmim_pretrain](main_simmim.py 218): INFO Train: [132/200][0/6787] eta 2:46:38 lr 0.000200 time 1.4732 (1.4732) loss 0.3678 (0.3678) grad_norm 200131.4062 (200131.4062) mem 14543MB +[2023-10-12 17:44:03 simmim_pretrain](main_simmim.py 218): INFO Train: [132/200][500/6787] eta 0:26:29 lr 0.000200 time 0.2553 (0.2528) loss 0.3516 (0.3598) grad_norm 206942.5312 (235991.3906) mem 14543MB +[2023-10-12 17:46:09 simmim_pretrain](main_simmim.py 218): INFO Train: [132/200][1000/6787] eta 0:24:17 lr 0.000200 time 0.2513 (0.2519) loss 0.3686 (0.3591) grad_norm 165154.3750 (237167.0000) mem 14543MB +[2023-10-12 17:48:14 simmim_pretrain](main_simmim.py 218): INFO Train: [132/200][1500/6787] eta 0:22:09 lr 0.000200 time 0.2540 (0.2515) loss 0.3346 (0.3600) grad_norm 162795.5781 (inf) mem 14543MB +[2023-10-12 17:50:19 simmim_pretrain](main_simmim.py 218): INFO Train: [132/200][2000/6787] eta 0:20:02 lr 0.000200 time 0.2529 (0.2512) loss 0.3642 (0.3606) grad_norm 124697.1172 (inf) mem 14543MB +[2023-10-12 17:52:24 simmim_pretrain](main_simmim.py 218): INFO Train: [132/200][2500/6787] eta 0:17:56 lr 0.000200 time 0.2501 (0.2510) loss 0.3689 (0.3612) grad_norm 95991.1250 (inf) mem 14543MB +[2023-10-12 17:54:30 simmim_pretrain](main_simmim.py 218): INFO Train: [132/200][3000/6787] eta 0:15:50 lr 0.000200 time 0.2466 (0.2510) loss 0.3848 (0.3619) grad_norm 61610.3945 (inf) mem 14543MB +[2023-10-12 17:56:35 simmim_pretrain](main_simmim.py 218): INFO Train: [132/200][3500/6787] eta 0:13:44 lr 0.000200 time 0.2473 (0.2509) loss 0.3727 (0.3623) grad_norm 62217.8867 (inf) mem 14543MB +[2023-10-12 17:58:40 simmim_pretrain](main_simmim.py 218): INFO Train: [132/200][4000/6787] eta 0:11:39 lr 0.000200 time 0.2538 (0.2509) loss 0.3864 (0.3624) grad_norm 72399.7812 (inf) mem 14543MB 
+[2023-10-12 18:00:45 simmim_pretrain](main_simmim.py 218): INFO Train: [132/200][4500/6787] eta 0:09:33 lr 0.000200 time 0.2534 (0.2508) loss 0.3385 (0.3625) grad_norm 87369.2734 (inf) mem 14543MB +[2023-10-12 18:02:51 simmim_pretrain](main_simmim.py 218): INFO Train: [132/200][5000/6787] eta 0:07:28 lr 0.000200 time 0.2466 (0.2508) loss 0.3484 (0.3623) grad_norm 137244.1250 (inf) mem 14543MB +[2023-10-12 18:04:56 simmim_pretrain](main_simmim.py 218): INFO Train: [132/200][5500/6787] eta 0:05:22 lr 0.000200 time 0.2459 (0.2508) loss 0.3530 (0.3621) grad_norm 165874.4688 (inf) mem 14543MB +[2023-10-12 18:07:01 simmim_pretrain](main_simmim.py 218): INFO Train: [132/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2501 (0.2508) loss 0.3632 (0.3619) grad_norm 136591.6094 (inf) mem 14543MB +[2023-10-12 18:09:06 simmim_pretrain](main_simmim.py 218): INFO Train: [132/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2504 (0.2507) loss 0.3445 (0.3618) grad_norm 135790.5781 (inf) mem 14543MB +[2023-10-12 18:10:19 simmim_pretrain](main_simmim.py 228): INFO EPOCH 132 training takes 0:28:22 +[2023-10-12 18:10:21 simmim_pretrain](main_simmim.py 218): INFO Train: [133/200][0/6787] eta 3:29:28 lr 0.000200 time 1.8519 (1.8519) loss 0.3727 (0.3727) grad_norm 97838.2031 (97838.2031) mem 14543MB +[2023-10-12 18:12:26 simmim_pretrain](main_simmim.py 218): INFO Train: [133/200][500/6787] eta 0:26:31 lr 0.000200 time 0.2515 (0.2531) loss 0.3704 (0.3590) grad_norm 193506.2188 (157063.5312) mem 14543MB +[2023-10-12 18:14:31 simmim_pretrain](main_simmim.py 218): INFO Train: [133/200][1000/6787] eta 0:24:16 lr 0.000200 time 0.2538 (0.2517) loss 0.3380 (0.3586) grad_norm 227608.5625 (170667.0781) mem 14543MB +[2023-10-12 18:16:36 simmim_pretrain](main_simmim.py 218): INFO Train: [133/200][1500/6787] eta 0:22:08 lr 0.000200 time 0.2473 (0.2514) loss 0.3699 (0.3582) grad_norm 175783.0469 (182904.2656) mem 14543MB +[2023-10-12 18:18:41 simmim_pretrain](main_simmim.py 218): INFO Train: 
[133/200][2000/6787] eta 0:20:01 lr 0.000200 time 0.2491 (0.2511) loss 0.3464 (0.3578) grad_norm 581568.2500 (204627.3906) mem 14543MB +[2023-10-12 18:20:47 simmim_pretrain](main_simmim.py 218): INFO Train: [133/200][2500/6787] eta 0:17:55 lr 0.000200 time 0.2501 (0.2510) loss 0.3436 (0.3577) grad_norm 329301.3438 (223439.9375) mem 14543MB +[2023-10-12 18:22:52 simmim_pretrain](main_simmim.py 218): INFO Train: [133/200][3000/6787] eta 0:15:50 lr 0.000200 time 0.2526 (0.2509) loss 0.3609 (0.3574) grad_norm 442782.5312 (252066.7656) mem 14543MB +[2023-10-12 18:24:57 simmim_pretrain](main_simmim.py 218): INFO Train: [133/200][3500/6787] eta 0:13:44 lr 0.000200 time 0.2496 (0.2508) loss 0.3751 (0.3572) grad_norm 326120.0625 (inf) mem 14543MB +[2023-10-12 18:27:02 simmim_pretrain](main_simmim.py 218): INFO Train: [133/200][4000/6787] eta 0:11:38 lr 0.000200 time 0.2509 (0.2508) loss 0.3789 (0.3573) grad_norm 178101.1719 (inf) mem 14543MB +[2023-10-12 18:29:08 simmim_pretrain](main_simmim.py 218): INFO Train: [133/200][4500/6787] eta 0:09:33 lr 0.000200 time 0.2502 (0.2507) loss 0.3726 (0.3576) grad_norm 193723.7344 (inf) mem 14543MB +[2023-10-12 18:31:13 simmim_pretrain](main_simmim.py 218): INFO Train: [133/200][5000/6787] eta 0:07:27 lr 0.000200 time 0.2459 (0.2507) loss 0.3557 (0.3580) grad_norm 140253.6875 (inf) mem 14543MB +[2023-10-12 18:33:18 simmim_pretrain](main_simmim.py 218): INFO Train: [133/200][5500/6787] eta 0:05:22 lr 0.000200 time 0.2536 (0.2506) loss 0.3665 (0.3583) grad_norm 127696.6719 (inf) mem 14543MB +[2023-10-12 18:35:23 simmim_pretrain](main_simmim.py 218): INFO Train: [133/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2490 (0.2507) loss 0.3338 (0.3585) grad_norm 123458.5078 (inf) mem 14543MB +[2023-10-12 18:37:29 simmim_pretrain](main_simmim.py 218): INFO Train: [133/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2465 (0.2507) loss 0.3659 (0.3586) grad_norm 133800.5625 (inf) mem 14543MB +[2023-10-12 18:38:41 simmim_pretrain](main_simmim.py 
228): INFO EPOCH 133 training takes 0:28:22 +[2023-10-12 18:38:43 simmim_pretrain](main_simmim.py 218): INFO Train: [134/200][0/6787] eta 2:45:43 lr 0.000200 time 1.4651 (1.4651) loss 0.3602 (0.3602) grad_norm 185661.7031 (185661.7031) mem 14543MB +[2023-10-12 18:40:48 simmim_pretrain](main_simmim.py 218): INFO Train: [134/200][500/6787] eta 0:26:31 lr 0.000200 time 0.2529 (0.2531) loss 0.3573 (0.3572) grad_norm 159416.6875 (171021.7812) mem 14543MB +[2023-10-12 18:42:53 simmim_pretrain](main_simmim.py 218): INFO Train: [134/200][1000/6787] eta 0:24:18 lr 0.000200 time 0.2546 (0.2520) loss 0.3599 (0.3586) grad_norm 202375.3906 (193314.6250) mem 14543MB +[2023-10-12 18:44:59 simmim_pretrain](main_simmim.py 218): INFO Train: [134/200][1500/6787] eta 0:22:10 lr 0.000200 time 0.2523 (0.2517) loss 0.3494 (0.3586) grad_norm 161611.0781 (201038.0312) mem 14543MB +[2023-10-12 18:47:05 simmim_pretrain](main_simmim.py 218): INFO Train: [134/200][2000/6787] eta 0:20:05 lr 0.000200 time 0.2452 (0.2519) loss 0.3617 (0.3584) grad_norm 285122.7812 (221396.8750) mem 14543MB +[2023-10-12 18:49:12 simmim_pretrain](main_simmim.py 218): INFO Train: [134/200][2500/6787] eta 0:18:00 lr 0.000200 time 0.2537 (0.2521) loss 0.3521 (0.3583) grad_norm 375998.7500 (236736.6094) mem 14543MB +[2023-10-12 18:51:18 simmim_pretrain](main_simmim.py 218): INFO Train: [134/200][3000/6787] eta 0:15:55 lr 0.000200 time 0.2536 (0.2523) loss 0.3517 (0.3580) grad_norm 201996.0156 (inf) mem 14543MB +[2023-10-12 18:53:26 simmim_pretrain](main_simmim.py 218): INFO Train: [134/200][3500/6787] eta 0:13:50 lr 0.000200 time 0.2529 (0.2526) loss 0.3649 (0.3581) grad_norm 272885.8125 (inf) mem 14543MB +[2023-10-12 18:55:32 simmim_pretrain](main_simmim.py 218): INFO Train: [134/200][4000/6787] eta 0:11:44 lr 0.000200 time 0.2530 (0.2527) loss 0.3591 (0.3582) grad_norm 244011.6094 (inf) mem 14543MB +[2023-10-12 18:57:39 simmim_pretrain](main_simmim.py 218): INFO Train: [134/200][4500/6787] eta 0:09:38 lr 0.000200 
time 0.2570 (0.2528) loss 0.3658 (0.3583) grad_norm 191045.4375 (inf) mem 14543MB +[2023-10-12 18:59:46 simmim_pretrain](main_simmim.py 218): INFO Train: [134/200][5000/6787] eta 0:07:31 lr 0.000200 time 0.2474 (0.2528) loss 0.3627 (0.3583) grad_norm 210572.5938 (inf) mem 14543MB +[2023-10-12 19:01:52 simmim_pretrain](main_simmim.py 218): INFO Train: [134/200][5500/6787] eta 0:05:25 lr 0.000200 time 0.2566 (0.2529) loss 0.3706 (0.3582) grad_norm 584391.1250 (inf) mem 14543MB +[2023-10-12 19:04:00 simmim_pretrain](main_simmim.py 218): INFO Train: [134/200][6000/6787] eta 0:03:19 lr 0.000200 time 0.2538 (0.2531) loss 0.3448 (0.3581) grad_norm 215602.3281 (inf) mem 14543MB +[2023-10-12 19:06:08 simmim_pretrain](main_simmim.py 218): INFO Train: [134/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2540 (0.2533) loss 0.3470 (0.3580) grad_norm 324175.9375 (inf) mem 14543MB +[2023-10-12 19:07:21 simmim_pretrain](main_simmim.py 228): INFO EPOCH 134 training takes 0:28:40 +[2023-10-12 19:07:23 simmim_pretrain](main_simmim.py 218): INFO Train: [135/200][0/6787] eta 2:47:38 lr 0.000200 time 1.4820 (1.4820) loss 0.3740 (0.3740) grad_norm 411637.9375 (411637.9375) mem 14543MB +[2023-10-12 19:09:29 simmim_pretrain](main_simmim.py 218): INFO Train: [135/200][500/6787] eta 0:26:42 lr 0.000200 time 0.2594 (0.2550) loss 0.3374 (0.3573) grad_norm 286568.0000 (inf) mem 14543MB +[2023-10-12 19:11:35 simmim_pretrain](main_simmim.py 218): INFO Train: [135/200][1000/6787] eta 0:24:28 lr 0.000200 time 0.2591 (0.2538) loss 0.3576 (0.3568) grad_norm 364674.8750 (inf) mem 14543MB +[2023-10-12 19:13:42 simmim_pretrain](main_simmim.py 218): INFO Train: [135/200][1500/6787] eta 0:22:19 lr 0.000200 time 0.2546 (0.2534) loss 0.3367 (0.3570) grad_norm 402828.5938 (inf) mem 14543MB +[2023-10-12 19:15:48 simmim_pretrain](main_simmim.py 218): INFO Train: [135/200][2000/6787] eta 0:20:12 lr 0.000200 time 0.2540 (0.2533) loss 0.3637 (0.3569) grad_norm 463561.6562 (inf) mem 14543MB +[2023-10-12 19:17:55 
simmim_pretrain](main_simmim.py 218): INFO Train: [135/200][2500/6787] eta 0:18:05 lr 0.000200 time 0.2556 (0.2533) loss 0.3554 (0.3567) grad_norm 254329.4688 (inf) mem 14543MB +[2023-10-12 19:20:01 simmim_pretrain](main_simmim.py 218): INFO Train: [135/200][3000/6787] eta 0:15:59 lr 0.000200 time 0.2468 (0.2533) loss 0.3444 (0.3566) grad_norm 396845.4062 (inf) mem 14543MB +[2023-10-12 19:22:09 simmim_pretrain](main_simmim.py 218): INFO Train: [135/200][3500/6787] eta 0:13:53 lr 0.000200 time 0.2538 (0.2534) loss 0.3718 (0.3567) grad_norm 240162.9688 (inf) mem 14543MB +[2023-10-12 19:24:18 simmim_pretrain](main_simmim.py 218): INFO Train: [135/200][4000/6787] eta 0:11:47 lr 0.000200 time 0.2541 (0.2540) loss 0.3508 (0.3568) grad_norm 281023.1250 (inf) mem 14543MB +[2023-10-12 19:26:26 simmim_pretrain](main_simmim.py 218): INFO Train: [135/200][4500/6787] eta 0:09:41 lr 0.000200 time 0.2559 (0.2544) loss 0.3683 (0.3569) grad_norm 421727.2188 (inf) mem 14543MB +[2023-10-12 19:28:35 simmim_pretrain](main_simmim.py 218): INFO Train: [135/200][5000/6787] eta 0:07:35 lr 0.000200 time 0.2594 (0.2547) loss 0.3510 (0.3569) grad_norm 552607.7500 (inf) mem 14543MB +[2023-10-12 19:30:43 simmim_pretrain](main_simmim.py 218): INFO Train: [135/200][5500/6787] eta 0:05:27 lr 0.000200 time 0.2578 (0.2548) loss 0.3710 (0.3569) grad_norm 757826.7500 (inf) mem 14543MB +[2023-10-12 19:32:51 simmim_pretrain](main_simmim.py 218): INFO Train: [135/200][6000/6787] eta 0:03:20 lr 0.000200 time 0.2540 (0.2549) loss 0.3531 (0.3569) grad_norm 455450.0938 (inf) mem 14543MB +[2023-10-12 19:34:59 simmim_pretrain](main_simmim.py 218): INFO Train: [135/200][6500/6787] eta 0:01:13 lr 0.000200 time 0.2551 (0.2550) loss 0.3612 (0.3570) grad_norm 351890.0625 (inf) mem 14543MB +[2023-10-12 19:36:13 simmim_pretrain](main_simmim.py 228): INFO EPOCH 135 training takes 0:28:51 +[2023-10-12 19:36:14 simmim_pretrain](main_simmim.py 218): INFO Train: [136/200][0/6787] eta 2:48:19 lr 0.000200 time 1.4881 
(1.4881) loss 0.3467 (0.3467) grad_norm 247090.6250 (247090.6250) mem 14543MB +[2023-10-12 19:38:21 simmim_pretrain](main_simmim.py 218): INFO Train: [136/200][500/6787] eta 0:26:46 lr 0.000200 time 0.2521 (0.2556) loss 0.3630 (0.3592) grad_norm 289176.0312 (246966.4375) mem 14543MB +[2023-10-12 19:40:28 simmim_pretrain](main_simmim.py 218): INFO Train: [136/200][1000/6787] eta 0:24:34 lr 0.000200 time 0.2555 (0.2547) loss 0.3498 (0.3589) grad_norm 276987.5000 (248842.0156) mem 14543MB +[2023-10-12 19:42:35 simmim_pretrain](main_simmim.py 218): INFO Train: [136/200][1500/6787] eta 0:22:25 lr 0.000200 time 0.2494 (0.2544) loss 0.3560 (0.3588) grad_norm 259091.4844 (246634.2969) mem 14543MB +[2023-10-12 19:44:43 simmim_pretrain](main_simmim.py 218): INFO Train: [136/200][2000/6787] eta 0:20:20 lr 0.000200 time 0.2600 (0.2550) loss 0.3680 (0.3588) grad_norm 312565.5625 (252054.2500) mem 14543MB +[2023-10-12 19:46:53 simmim_pretrain](main_simmim.py 218): INFO Train: [136/200][2500/6787] eta 0:18:16 lr 0.000200 time 0.2602 (0.2559) loss 0.3583 (0.3582) grad_norm 245105.7812 (282024.2188) mem 14543MB +[2023-10-12 19:49:02 simmim_pretrain](main_simmim.py 218): INFO Train: [136/200][3000/6787] eta 0:16:11 lr 0.000200 time 0.2604 (0.2564) loss 0.3767 (0.3581) grad_norm 230134.0000 (inf) mem 14543MB +[2023-10-12 19:51:12 simmim_pretrain](main_simmim.py 218): INFO Train: [136/200][3500/6787] eta 0:14:04 lr 0.000200 time 0.2601 (0.2569) loss 0.3369 (0.3582) grad_norm 246846.2500 (inf) mem 14543MB +[2023-10-12 19:53:22 simmim_pretrain](main_simmim.py 218): INFO Train: [136/200][4000/6787] eta 0:11:56 lr 0.000200 time 0.2597 (0.2572) loss 0.3494 (0.3582) grad_norm 186701.0312 (inf) mem 14543MB +[2023-10-12 19:55:32 simmim_pretrain](main_simmim.py 218): INFO Train: [136/200][4500/6787] eta 0:09:48 lr 0.000200 time 0.2611 (0.2574) loss 0.3485 (0.3583) grad_norm 285443.7812 (inf) mem 14543MB +[2023-10-12 19:57:41 simmim_pretrain](main_simmim.py 218): INFO Train: 
[136/200][5000/6787] eta 0:07:40 lr 0.000200 time 0.2606 (0.2576) loss 0.3421 (0.3583) grad_norm 302235.7500 (inf) mem 14543MB +[2023-10-12 19:59:51 simmim_pretrain](main_simmim.py 218): INFO Train: [136/200][5500/6787] eta 0:05:31 lr 0.000200 time 0.2600 (0.2578) loss 0.3415 (0.3582) grad_norm 318666.1562 (inf) mem 14543MB +[2023-10-12 20:02:01 simmim_pretrain](main_simmim.py 218): INFO Train: [136/200][6000/6787] eta 0:03:22 lr 0.000200 time 0.2593 (0.2579) loss 0.3480 (0.3582) grad_norm 521203.0312 (inf) mem 14543MB +[2023-10-12 20:04:10 simmim_pretrain](main_simmim.py 218): INFO Train: [136/200][6500/6787] eta 0:01:14 lr 0.000200 time 0.2576 (0.2580) loss 0.3527 (0.3581) grad_norm 325966.9375 (inf) mem 14543MB +[2023-10-12 20:05:25 simmim_pretrain](main_simmim.py 228): INFO EPOCH 136 training takes 0:29:12 +[2023-10-12 20:05:27 simmim_pretrain](main_simmim.py 218): INFO Train: [137/200][0/6787] eta 2:44:03 lr 0.000200 time 1.4503 (1.4503) loss 0.3651 (0.3651) grad_norm 582980.5000 (582980.5000) mem 14543MB +[2023-10-12 20:07:34 simmim_pretrain](main_simmim.py 218): INFO Train: [137/200][500/6787] eta 0:26:56 lr 0.000200 time 0.2566 (0.2572) loss 0.3694 (0.3567) grad_norm 660918.7500 (451245.0625) mem 14543MB +[2023-10-12 20:09:44 simmim_pretrain](main_simmim.py 218): INFO Train: [137/200][1000/6787] eta 0:24:55 lr 0.000200 time 0.2541 (0.2585) loss 0.3670 (0.3567) grad_norm 429719.5625 (inf) mem 14543MB +[2023-10-12 20:11:54 simmim_pretrain](main_simmim.py 218): INFO Train: [137/200][1500/6787] eta 0:22:47 lr 0.000200 time 0.2586 (0.2587) loss 0.3533 (0.3577) grad_norm 226942.7812 (inf) mem 14543MB +[2023-10-12 20:14:03 simmim_pretrain](main_simmim.py 218): INFO Train: [137/200][2000/6787] eta 0:20:37 lr 0.000200 time 0.2582 (0.2586) loss 0.3430 (0.3578) grad_norm 114029.3672 (inf) mem 14543MB +[2023-10-12 20:16:12 simmim_pretrain](main_simmim.py 218): INFO Train: [137/200][2500/6787] eta 0:18:28 lr 0.000200 time 0.2582 (0.2585) loss 0.3340 (0.3581) grad_norm 
342805.9062 (inf) mem 14543MB +[2023-10-12 20:18:21 simmim_pretrain](main_simmim.py 218): INFO Train: [137/200][3000/6787] eta 0:16:18 lr 0.000200 time 0.2587 (0.2585) loss 0.3495 (0.3582) grad_norm 198535.3906 (inf) mem 14543MB +[2023-10-12 20:20:30 simmim_pretrain](main_simmim.py 218): INFO Train: [137/200][3500/6787] eta 0:14:09 lr 0.000200 time 0.2578 (0.2584) loss 0.3737 (0.3580) grad_norm 348686.5312 (inf) mem 14543MB +[2023-10-12 20:22:39 simmim_pretrain](main_simmim.py 218): INFO Train: [137/200][4000/6787] eta 0:12:00 lr 0.000200 time 0.2588 (0.2584) loss 0.3506 (0.3580) grad_norm 167342.8281 (inf) mem 14543MB +[2023-10-12 20:24:48 simmim_pretrain](main_simmim.py 218): INFO Train: [137/200][4500/6787] eta 0:09:50 lr 0.000200 time 0.2593 (0.2584) loss 0.3516 (0.3582) grad_norm 280914.9375 (inf) mem 14543MB +[2023-10-12 20:26:57 simmim_pretrain](main_simmim.py 218): INFO Train: [137/200][5000/6787] eta 0:07:41 lr 0.000200 time 0.2575 (0.2583) loss 0.3552 (0.3584) grad_norm 261513.7500 (inf) mem 14543MB +[2023-10-12 20:29:06 simmim_pretrain](main_simmim.py 218): INFO Train: [137/200][5500/6787] eta 0:05:32 lr 0.000200 time 0.2569 (0.2582) loss 0.3259 (0.3584) grad_norm 236525.0312 (inf) mem 14543MB +[2023-10-12 20:31:15 simmim_pretrain](main_simmim.py 218): INFO Train: [137/200][6000/6787] eta 0:03:23 lr 0.000200 time 0.2579 (0.2582) loss 0.3737 (0.3585) grad_norm 262438.4062 (inf) mem 14543MB +[2023-10-12 20:33:24 simmim_pretrain](main_simmim.py 218): INFO Train: [137/200][6500/6787] eta 0:01:14 lr 0.000200 time 0.2574 (0.2582) loss 0.3743 (0.3583) grad_norm 383274.7188 (inf) mem 14543MB +[2023-10-12 20:34:38 simmim_pretrain](main_simmim.py 228): INFO EPOCH 137 training takes 0:29:13 +[2023-10-12 20:34:40 simmim_pretrain](main_simmim.py 218): INFO Train: [138/200][0/6787] eta 2:53:26 lr 0.000200 time 1.5333 (1.5333) loss 0.3688 (0.3688) grad_norm 423714.8438 (423714.8438) mem 14543MB +[2023-10-12 20:36:46 simmim_pretrain](main_simmim.py 218): INFO Train: 
[138/200][500/6787] eta 0:26:46 lr 0.000200 time 0.2546 (0.2555) loss 0.3594 (0.3572) grad_norm 356931.8125 (inf) mem 14543MB +[2023-10-12 20:38:52 simmim_pretrain](main_simmim.py 218): INFO Train: [138/200][1000/6787] eta 0:24:28 lr 0.000200 time 0.2536 (0.2537) loss 0.3698 (0.3579) grad_norm 201832.4062 (inf) mem 14543MB +[2023-10-12 20:40:58 simmim_pretrain](main_simmim.py 218): INFO Train: [138/200][1500/6787] eta 0:22:18 lr 0.000200 time 0.2507 (0.2531) loss 0.3575 (0.3582) grad_norm 335909.4375 (inf) mem 14543MB +[2023-10-12 20:43:05 simmim_pretrain](main_simmim.py 218): INFO Train: [138/200][2000/6787] eta 0:20:11 lr 0.000200 time 0.2508 (0.2530) loss 0.3636 (0.3583) grad_norm 392523.2812 (inf) mem 14543MB +[2023-10-12 20:45:11 simmim_pretrain](main_simmim.py 218): INFO Train: [138/200][2500/6787] eta 0:18:04 lr 0.000200 time 0.2519 (0.2530) loss 0.3639 (0.3591) grad_norm 117083.3594 (nan) mem 14543MB +[2023-10-12 20:47:18 simmim_pretrain](main_simmim.py 218): INFO Train: [138/200][3000/6787] eta 0:15:58 lr 0.000200 time 0.2488 (0.2531) loss 0.3443 (0.3598) grad_norm 98810.0312 (nan) mem 14543MB +[2023-10-12 20:49:25 simmim_pretrain](main_simmim.py 218): INFO Train: [138/200][3500/6787] eta 0:13:52 lr 0.000200 time 0.2534 (0.2532) loss 0.3387 (0.3599) grad_norm 131022.7344 (nan) mem 14543MB +[2023-10-12 20:51:31 simmim_pretrain](main_simmim.py 218): INFO Train: [138/200][4000/6787] eta 0:11:45 lr 0.000200 time 0.2502 (0.2532) loss 0.3445 (0.3601) grad_norm 184089.5625 (nan) mem 14543MB +[2023-10-12 20:53:38 simmim_pretrain](main_simmim.py 218): INFO Train: [138/200][4500/6787] eta 0:09:39 lr 0.000200 time 0.2529 (0.2532) loss 0.3495 (0.3600) grad_norm 177251.2500 (nan) mem 14543MB +[2023-10-12 20:55:45 simmim_pretrain](main_simmim.py 218): INFO Train: [138/200][5000/6787] eta 0:07:32 lr 0.000200 time 0.2519 (0.2532) loss 0.3626 (0.3599) grad_norm 149186.5156 (nan) mem 14543MB +[2023-10-12 20:57:52 simmim_pretrain](main_simmim.py 218): INFO Train: 
[138/200][5500/6787] eta 0:05:26 lr 0.000200 time 0.2509 (0.2534) loss 0.3609 (0.3597) grad_norm 278803.7500 (nan) mem 14543MB +[2023-10-12 21:00:02 simmim_pretrain](main_simmim.py 218): INFO Train: [138/200][6000/6787] eta 0:03:19 lr 0.000200 time 0.2610 (0.2539) loss 0.3561 (0.3597) grad_norm 409670.2812 (nan) mem 14543MB +[2023-10-12 21:02:11 simmim_pretrain](main_simmim.py 218): INFO Train: [138/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2540 (0.2541) loss 0.3453 (0.3595) grad_norm 335395.8125 (nan) mem 14543MB +[2023-10-12 21:03:24 simmim_pretrain](main_simmim.py 228): INFO EPOCH 138 training takes 0:28:45 +[2023-10-12 21:03:26 simmim_pretrain](main_simmim.py 218): INFO Train: [139/200][0/6787] eta 2:46:39 lr 0.000200 time 1.4733 (1.4733) loss 0.3603 (0.3603) grad_norm 736024.6250 (736024.6250) mem 14543MB +[2023-10-12 21:05:32 simmim_pretrain](main_simmim.py 218): INFO Train: [139/200][500/6787] eta 0:26:39 lr 0.000200 time 0.2537 (0.2544) loss 0.3327 (0.3573) grad_norm 310923.6250 (439042.5625) mem 14543MB +[2023-10-12 21:07:38 simmim_pretrain](main_simmim.py 218): INFO Train: [139/200][1000/6787] eta 0:24:25 lr 0.000200 time 0.2494 (0.2533) loss 0.3654 (0.3576) grad_norm 322528.1875 (412265.4375) mem 14543MB +[2023-10-12 21:09:44 simmim_pretrain](main_simmim.py 218): INFO Train: [139/200][1500/6787] eta 0:22:16 lr 0.000200 time 0.2509 (0.2528) loss 0.3567 (0.3575) grad_norm 560631.5000 (inf) mem 14543MB +[2023-10-12 21:11:50 simmim_pretrain](main_simmim.py 218): INFO Train: [139/200][2000/6787] eta 0:20:09 lr 0.000200 time 0.2499 (0.2527) loss 0.3597 (0.3575) grad_norm 237367.2812 (inf) mem 14543MB +[2023-10-12 21:13:57 simmim_pretrain](main_simmim.py 218): INFO Train: [139/200][2500/6787] eta 0:18:04 lr 0.000200 time 0.2570 (0.2529) loss 0.3483 (0.3573) grad_norm 364589.4688 (inf) mem 14543MB +[2023-10-12 21:16:03 simmim_pretrain](main_simmim.py 218): INFO Train: [139/200][3000/6787] eta 0:15:58 lr 0.000200 time 0.2557 (0.2530) loss 0.3291 (0.3573) 
grad_norm 521464.1562 (inf) mem 14543MB +[2023-10-12 21:18:10 simmim_pretrain](main_simmim.py 218): INFO Train: [139/200][3500/6787] eta 0:13:51 lr 0.000200 time 0.2513 (0.2530) loss 0.3619 (0.3571) grad_norm 638485.6875 (inf) mem 14543MB +[2023-10-12 21:20:17 simmim_pretrain](main_simmim.py 218): INFO Train: [139/200][4000/6787] eta 0:11:45 lr 0.000200 time 0.2657 (0.2531) loss 0.3526 (0.3572) grad_norm 305279.9688 (inf) mem 14543MB +[2023-10-12 21:22:23 simmim_pretrain](main_simmim.py 218): INFO Train: [139/200][4500/6787] eta 0:09:38 lr 0.000200 time 0.2503 (0.2531) loss 0.3542 (0.3574) grad_norm 303506.0625 (inf) mem 14543MB +[2023-10-12 21:24:31 simmim_pretrain](main_simmim.py 218): INFO Train: [139/200][5000/6787] eta 0:07:32 lr 0.000200 time 0.2553 (0.2532) loss 0.3674 (0.3576) grad_norm 318532.4688 (inf) mem 14543MB +[2023-10-12 21:26:38 simmim_pretrain](main_simmim.py 218): INFO Train: [139/200][5500/6787] eta 0:05:26 lr 0.000200 time 0.2555 (0.2534) loss 0.3693 (0.3577) grad_norm 354473.2500 (inf) mem 14543MB +[2023-10-12 21:28:45 simmim_pretrain](main_simmim.py 218): INFO Train: [139/200][6000/6787] eta 0:03:19 lr 0.000200 time 0.2523 (0.2535) loss 0.3707 (0.3576) grad_norm 181190.7031 (inf) mem 14543MB +[2023-10-12 21:30:53 simmim_pretrain](main_simmim.py 218): INFO Train: [139/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2554 (0.2536) loss 0.3820 (0.3577) grad_norm 196072.5469 (inf) mem 14543MB +[2023-10-12 21:32:06 simmim_pretrain](main_simmim.py 228): INFO EPOCH 139 training takes 0:28:41 +[2023-10-12 21:32:07 simmim_pretrain](main_simmim.py 218): INFO Train: [140/200][0/6787] eta 2:40:49 lr 0.000200 time 1.4217 (1.4217) loss 0.3671 (0.3671) grad_norm 157128.3750 (157128.3750) mem 14543MB +[2023-10-12 21:34:13 simmim_pretrain](main_simmim.py 218): INFO Train: [140/200][500/6787] eta 0:26:37 lr 0.000200 time 0.2471 (0.2542) loss 0.3690 (0.3567) grad_norm 192203.0625 (254308.3594) mem 14543MB +[2023-10-12 21:36:19 simmim_pretrain](main_simmim.py 
218): INFO Train: [140/200][1000/6787] eta 0:24:25 lr 0.000200 time 0.2531 (0.2532) loss 0.3539 (0.3576) grad_norm 240057.7812 (254957.1875) mem 14543MB +[2023-10-12 21:38:25 simmim_pretrain](main_simmim.py 218): INFO Train: [140/200][1500/6787] eta 0:22:17 lr 0.000200 time 0.2517 (0.2530) loss 0.3649 (0.3578) grad_norm 353946.9062 (260324.5938) mem 14543MB +[2023-10-12 21:40:32 simmim_pretrain](main_simmim.py 218): INFO Train: [140/200][2000/6787] eta 0:20:11 lr 0.000200 time 0.2510 (0.2530) loss 0.3665 (0.3576) grad_norm 382478.8750 (287769.2500) mem 14543MB +[2023-10-12 21:42:39 simmim_pretrain](main_simmim.py 218): INFO Train: [140/200][2500/6787] eta 0:18:05 lr 0.000200 time 0.2489 (0.2532) loss 0.3504 (0.3575) grad_norm 515669.9375 (307327.9375) mem 14543MB +[2023-10-12 21:44:48 simmim_pretrain](main_simmim.py 218): INFO Train: [140/200][3000/6787] eta 0:16:02 lr 0.000200 time 0.2595 (0.2542) loss 0.3828 (0.3574) grad_norm 327587.2812 (327162.2812) mem 14543MB +[2023-10-12 21:46:58 simmim_pretrain](main_simmim.py 218): INFO Train: [140/200][3500/6787] eta 0:13:57 lr 0.000200 time 0.2578 (0.2549) loss 0.3503 (0.3573) grad_norm 603094.0000 (inf) mem 14543MB +[2023-10-12 21:49:07 simmim_pretrain](main_simmim.py 218): INFO Train: [140/200][4000/6787] eta 0:11:51 lr 0.000200 time 0.2594 (0.2554) loss 0.3414 (0.3574) grad_norm 283232.3750 (inf) mem 14543MB +[2023-10-12 21:51:17 simmim_pretrain](main_simmim.py 218): INFO Train: [140/200][4500/6787] eta 0:09:44 lr 0.000200 time 0.2598 (0.2558) loss 0.3740 (0.3576) grad_norm 196739.5625 (inf) mem 14543MB +[2023-10-12 21:53:26 simmim_pretrain](main_simmim.py 218): INFO Train: [140/200][5000/6787] eta 0:07:37 lr 0.000200 time 0.2593 (0.2561) loss 0.3762 (0.3578) grad_norm 139841.1562 (inf) mem 14543MB +[2023-10-12 21:55:36 simmim_pretrain](main_simmim.py 218): INFO Train: [140/200][5500/6787] eta 0:05:29 lr 0.000200 time 0.2584 (0.2564) loss 0.3438 (0.3578) grad_norm 201952.2031 (inf) mem 14543MB +[2023-10-12 21:57:46 
simmim_pretrain](main_simmim.py 218): INFO Train: [140/200][6000/6787] eta 0:03:21 lr 0.000200 time 0.2593 (0.2566) loss 0.3781 (0.3579) grad_norm 180120.3438 (inf) mem 14543MB +[2023-10-12 21:59:55 simmim_pretrain](main_simmim.py 218): INFO Train: [140/200][6500/6787] eta 0:01:13 lr 0.000200 time 0.2543 (0.2568) loss 0.3643 (0.3578) grad_norm 331432.2188 (inf) mem 14543MB +[2023-10-12 22:01:10 simmim_pretrain](main_simmim.py 228): INFO EPOCH 140 training takes 0:29:04 +[2023-10-12 22:01:10 simmim_pretrain](utils.py 62): INFO /root/autodl-tmp/LSQ-simmim/checkpoint/simmim_pretrain/vit_simmim/ckpt_epoch_140.pth saving...... +[2023-10-12 22:01:10 simmim_pretrain](utils.py 64): INFO /root/autodl-tmp/LSQ-simmim/checkpoint/simmim_pretrain/vit_simmim/ckpt_epoch_140.pth saved !!! +[2023-10-12 22:01:12 simmim_pretrain](main_simmim.py 218): INFO Train: [141/200][0/6787] eta 2:29:52 lr 0.000200 time 1.3249 (1.3249) loss 0.3513 (0.3513) grad_norm 452105.2188 (452105.2188) mem 14543MB +[2023-10-12 22:03:17 simmim_pretrain](main_simmim.py 218): INFO Train: [141/200][500/6787] eta 0:26:34 lr 0.000200 time 0.2544 (0.2536) loss 0.3574 (0.3564) grad_norm 383170.1250 (436252.9375) mem 14543MB +[2023-10-12 22:05:24 simmim_pretrain](main_simmim.py 218): INFO Train: [141/200][1000/6787] eta 0:24:24 lr 0.000200 time 0.2506 (0.2530) loss 0.3442 (0.3569) grad_norm 657144.2500 (inf) mem 14543MB +[2023-10-12 22:07:31 simmim_pretrain](main_simmim.py 218): INFO Train: [141/200][1500/6787] eta 0:22:19 lr 0.000200 time 0.2561 (0.2534) loss 0.3572 (0.3573) grad_norm 566559.1875 (inf) mem 14543MB +[2023-10-12 22:09:38 simmim_pretrain](main_simmim.py 218): INFO Train: [141/200][2000/6787] eta 0:20:14 lr 0.000200 time 0.2530 (0.2536) loss 0.3593 (0.3575) grad_norm 276387.7188 (inf) mem 14543MB +[2023-10-12 22:11:45 simmim_pretrain](main_simmim.py 218): INFO Train: [141/200][2500/6787] eta 0:18:07 lr 0.000200 time 0.2483 (0.2536) loss 0.5117 (0.3687) grad_norm 11041.0029 (inf) mem 14543MB 
+[2023-10-12 22:13:51 simmim_pretrain](main_simmim.py 218): INFO Train: [141/200][3000/6787] eta 0:15:59 lr 0.000200 time 0.2501 (0.2535) loss 0.4524 (0.3900) grad_norm 46830.1914 (inf) mem 14543MB +[2023-10-12 22:15:57 simmim_pretrain](main_simmim.py 218): INFO Train: [141/200][3500/6787] eta 0:13:52 lr 0.000200 time 0.2474 (0.2533) loss 0.3939 (0.3922) grad_norm 19839.7910 (inf) mem 14543MB +[2023-10-12 22:18:03 simmim_pretrain](main_simmim.py 218): INFO Train: [141/200][4000/6787] eta 0:11:45 lr 0.000200 time 0.2490 (0.2532) loss 0.3741 (0.3896) grad_norm 43657.7344 (inf) mem 14543MB +[2023-10-12 22:20:10 simmim_pretrain](main_simmim.py 218): INFO Train: [141/200][4500/6787] eta 0:09:39 lr 0.000200 time 0.2528 (0.2532) loss 0.3706 (0.3872) grad_norm 31653.1074 (inf) mem 14543MB +[2023-10-12 22:22:17 simmim_pretrain](main_simmim.py 218): INFO Train: [141/200][5000/6787] eta 0:07:32 lr 0.000200 time 0.2465 (0.2533) loss 0.3519 (0.3848) grad_norm 17131.6387 (inf) mem 14543MB +[2023-10-12 22:24:24 simmim_pretrain](main_simmim.py 218): INFO Train: [141/200][5500/6787] eta 0:05:26 lr 0.000200 time 0.2509 (0.2534) loss 0.3499 (0.3827) grad_norm 70889.2344 (inf) mem 14543MB +[2023-10-12 22:26:31 simmim_pretrain](main_simmim.py 218): INFO Train: [141/200][6000/6787] eta 0:03:19 lr 0.000200 time 0.2538 (0.2534) loss 0.3450 (0.3811) grad_norm 66766.6719 (inf) mem 14543MB +[2023-10-12 22:28:37 simmim_pretrain](main_simmim.py 218): INFO Train: [141/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2506 (0.2533) loss 0.3536 (0.3795) grad_norm 75082.8594 (inf) mem 14543MB +[2023-10-12 22:29:50 simmim_pretrain](main_simmim.py 228): INFO EPOCH 141 training takes 0:28:39 +[2023-10-12 22:29:51 simmim_pretrain](main_simmim.py 218): INFO Train: [142/200][0/6787] eta 2:48:04 lr 0.000200 time 1.4858 (1.4858) loss 0.3555 (0.3555) grad_norm 75042.5000 (75042.5000) mem 14543MB +[2023-10-12 22:31:57 simmim_pretrain](main_simmim.py 218): INFO Train: [142/200][500/6787] eta 0:26:41 lr 0.000200 
time 0.2517 (0.2547) loss 0.3514 (0.3607) grad_norm 86339.3516 (78284.5312) mem 14543MB +[2023-10-12 22:34:04 simmim_pretrain](main_simmim.py 218): INFO Train: [142/200][1000/6787] eta 0:24:29 lr 0.000200 time 0.2458 (0.2539) loss 0.3757 (0.3603) grad_norm 103156.4844 (83900.3047) mem 14543MB +[2023-10-12 22:36:11 simmim_pretrain](main_simmim.py 218): INFO Train: [142/200][1500/6787] eta 0:22:21 lr 0.000200 time 0.2519 (0.2538) loss 0.3639 (0.3598) grad_norm 130637.8750 (88242.4922) mem 14543MB +[2023-10-12 22:38:18 simmim_pretrain](main_simmim.py 218): INFO Train: [142/200][2000/6787] eta 0:20:15 lr 0.000200 time 0.2516 (0.2539) loss 0.3417 (0.3596) grad_norm 117280.4922 (99600.8750) mem 14543MB +[2023-10-12 22:40:25 simmim_pretrain](main_simmim.py 218): INFO Train: [142/200][2500/6787] eta 0:18:08 lr 0.000200 time 0.2543 (0.2539) loss 0.3656 (0.3595) grad_norm 88122.8984 (111333.5781) mem 14543MB +[2023-10-12 22:42:32 simmim_pretrain](main_simmim.py 218): INFO Train: [142/200][3000/6787] eta 0:16:01 lr 0.000200 time 0.2557 (0.2538) loss 0.3608 (0.3592) grad_norm 88883.7188 (119312.1406) mem 14543MB +[2023-10-12 22:44:38 simmim_pretrain](main_simmim.py 218): INFO Train: [142/200][3500/6787] eta 0:13:54 lr 0.000200 time 0.2463 (0.2538) loss 0.3463 (0.3590) grad_norm 202045.5469 (128418.7656) mem 14543MB +[2023-10-12 22:46:45 simmim_pretrain](main_simmim.py 218): INFO Train: [142/200][4000/6787] eta 0:11:47 lr 0.000200 time 0.2467 (0.2537) loss 0.3541 (0.3588) grad_norm 226299.7969 (inf) mem 14543MB +[2023-10-12 22:48:52 simmim_pretrain](main_simmim.py 218): INFO Train: [142/200][4500/6787] eta 0:09:40 lr 0.000200 time 0.2568 (0.2537) loss 0.3487 (0.3588) grad_norm 152624.7188 (inf) mem 14543MB +[2023-10-12 22:50:59 simmim_pretrain](main_simmim.py 218): INFO Train: [142/200][5000/6787] eta 0:07:33 lr 0.000200 time 0.2508 (0.2537) loss 0.3639 (0.3586) grad_norm 95081.8828 (inf) mem 14543MB +[2023-10-12 22:53:05 simmim_pretrain](main_simmim.py 218): INFO Train: 
[142/200][5500/6787] eta 0:05:26 lr 0.000200 time 0.2568 (0.2536) loss 0.3787 (0.3586) grad_norm 107240.5078 (inf) mem 14543MB +[2023-10-12 22:55:12 simmim_pretrain](main_simmim.py 218): INFO Train: [142/200][6000/6787] eta 0:03:19 lr 0.000200 time 0.2579 (0.2536) loss 0.3574 (0.3586) grad_norm 327172.4375 (inf) mem 14543MB +[2023-10-12 22:57:18 simmim_pretrain](main_simmim.py 218): INFO Train: [142/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2532 (0.2535) loss 0.3595 (0.3585) grad_norm 257903.5781 (inf) mem 14543MB +[2023-10-12 22:58:31 simmim_pretrain](main_simmim.py 228): INFO EPOCH 142 training takes 0:28:41 +[2023-10-12 22:58:33 simmim_pretrain](main_simmim.py 218): INFO Train: [143/200][0/6787] eta 2:40:52 lr 0.000200 time 1.4222 (1.4222) loss 0.3443 (0.3443) grad_norm 464211.7500 (464211.7500) mem 14543MB +[2023-10-12 23:00:39 simmim_pretrain](main_simmim.py 218): INFO Train: [143/200][500/6787] eta 0:26:46 lr 0.000200 time 0.2545 (0.2555) loss 0.3399 (0.3560) grad_norm 304563.1875 (465708.3438) mem 14543MB +[2023-10-12 23:02:46 simmim_pretrain](main_simmim.py 218): INFO Train: [143/200][1000/6787] eta 0:24:36 lr 0.000200 time 0.2539 (0.2551) loss 0.3509 (0.3561) grad_norm 548714.6875 (inf) mem 14543MB +[2023-10-12 23:04:54 simmim_pretrain](main_simmim.py 218): INFO Train: [143/200][1500/6787] eta 0:22:29 lr 0.000200 time 0.2546 (0.2552) loss 0.3524 (0.3564) grad_norm 382324.8438 (inf) mem 14543MB +[2023-10-12 23:07:02 simmim_pretrain](main_simmim.py 218): INFO Train: [143/200][2000/6787] eta 0:20:21 lr 0.000200 time 0.2548 (0.2552) loss 0.3582 (0.3566) grad_norm 320679.6562 (inf) mem 14543MB +[2023-10-12 23:09:09 simmim_pretrain](main_simmim.py 218): INFO Train: [143/200][2500/6787] eta 0:18:13 lr 0.000200 time 0.2533 (0.2550) loss 0.3501 (0.3564) grad_norm 537145.4375 (inf) mem 14543MB +[2023-10-12 23:11:16 simmim_pretrain](main_simmim.py 218): INFO Train: [143/200][3000/6787] eta 0:16:05 lr 0.000200 time 0.2534 (0.2550) loss 0.3806 (0.3565) grad_norm 
490226.4375 (inf) mem 14543MB +[2023-10-12 23:13:24 simmim_pretrain](main_simmim.py 218): INFO Train: [143/200][3500/6787] eta 0:13:58 lr 0.000200 time 0.2564 (0.2550) loss 0.3742 (0.3567) grad_norm 154836.9844 (inf) mem 14543MB +[2023-10-12 23:15:31 simmim_pretrain](main_simmim.py 218): INFO Train: [143/200][4000/6787] eta 0:11:50 lr 0.000200 time 0.2526 (0.2550) loss 0.3631 (0.3570) grad_norm 210106.0469 (inf) mem 14543MB +[2023-10-12 23:17:39 simmim_pretrain](main_simmim.py 218): INFO Train: [143/200][4500/6787] eta 0:09:43 lr 0.000200 time 0.2539 (0.2550) loss 0.3453 (0.3571) grad_norm 184100.8281 (inf) mem 14543MB +[2023-10-12 23:19:47 simmim_pretrain](main_simmim.py 218): INFO Train: [143/200][5000/6787] eta 0:07:35 lr 0.000200 time 0.2538 (0.2550) loss 0.3572 (0.3572) grad_norm 161449.7344 (inf) mem 14543MB +[2023-10-12 23:21:54 simmim_pretrain](main_simmim.py 218): INFO Train: [143/200][5500/6787] eta 0:05:28 lr 0.000200 time 0.2527 (0.2551) loss 0.3663 (0.3573) grad_norm 146782.6094 (inf) mem 14543MB +[2023-10-12 23:24:02 simmim_pretrain](main_simmim.py 218): INFO Train: [143/200][6000/6787] eta 0:03:20 lr 0.000200 time 0.2594 (0.2551) loss 0.3601 (0.3573) grad_norm 251937.4844 (inf) mem 14543MB +[2023-10-12 23:26:10 simmim_pretrain](main_simmim.py 218): INFO Train: [143/200][6500/6787] eta 0:01:13 lr 0.000200 time 0.2529 (0.2551) loss 0.3617 (0.3572) grad_norm 499877.8125 (inf) mem 14543MB +[2023-10-12 23:27:24 simmim_pretrain](main_simmim.py 228): INFO EPOCH 143 training takes 0:28:52 +[2023-10-12 23:27:25 simmim_pretrain](main_simmim.py 218): INFO Train: [144/200][0/6787] eta 2:55:46 lr 0.000200 time 1.5539 (1.5539) loss 0.3760 (0.3760) grad_norm 382667.5312 (382667.5312) mem 14543MB +[2023-10-12 23:29:32 simmim_pretrain](main_simmim.py 218): INFO Train: [144/200][500/6787] eta 0:26:53 lr 0.000200 time 0.2544 (0.2566) loss 0.3574 (0.3577) grad_norm 399868.3438 (359994.4375) mem 14543MB +[2023-10-12 23:31:39 simmim_pretrain](main_simmim.py 218): INFO 
Train: [144/200][1000/6787] eta 0:24:38 lr 0.000200 time 0.2528 (0.2554) loss 0.3389 (0.3571) grad_norm 420865.2188 (inf) mem 14543MB +[2023-10-12 23:33:46 simmim_pretrain](main_simmim.py 218): INFO Train: [144/200][1500/6787] eta 0:22:26 lr 0.000200 time 0.2531 (0.2547) loss 0.3485 (0.3570) grad_norm 265816.0312 (inf) mem 14543MB +[2023-10-12 23:35:52 simmim_pretrain](main_simmim.py 218): INFO Train: [144/200][2000/6787] eta 0:20:15 lr 0.000200 time 0.2489 (0.2540) loss 0.3503 (0.3573) grad_norm 259058.4688 (inf) mem 14543MB +[2023-10-12 23:37:58 simmim_pretrain](main_simmim.py 218): INFO Train: [144/200][2500/6787] eta 0:18:07 lr 0.000200 time 0.2507 (0.2536) loss 0.3535 (0.3578) grad_norm 268508.8438 (inf) mem 14543MB +[2023-10-12 23:40:04 simmim_pretrain](main_simmim.py 218): INFO Train: [144/200][3000/6787] eta 0:15:59 lr 0.000200 time 0.2594 (0.2533) loss 0.3469 (0.3579) grad_norm 97643.1797 (inf) mem 14543MB +[2023-10-12 23:42:10 simmim_pretrain](main_simmim.py 218): INFO Train: [144/200][3500/6787] eta 0:13:52 lr 0.000200 time 0.2553 (0.2533) loss 0.3552 (0.3580) grad_norm 293765.2500 (inf) mem 14543MB +[2023-10-12 23:44:18 simmim_pretrain](main_simmim.py 218): INFO Train: [144/200][4000/6787] eta 0:11:46 lr 0.000200 time 0.2609 (0.2537) loss 0.3665 (0.3579) grad_norm 580131.0000 (inf) mem 14543MB +[2023-10-12 23:46:29 simmim_pretrain](main_simmim.py 218): INFO Train: [144/200][4500/6787] eta 0:09:41 lr 0.000200 time 0.2608 (0.2544) loss 0.3280 (0.3577) grad_norm 352433.3750 (inf) mem 14543MB +[2023-10-12 23:48:39 simmim_pretrain](main_simmim.py 218): INFO Train: [144/200][5000/6787] eta 0:07:35 lr 0.000200 time 0.2608 (0.2550) loss 0.3488 (0.3577) grad_norm 380837.4375 (inf) mem 14543MB +[2023-10-12 23:50:49 simmim_pretrain](main_simmim.py 218): INFO Train: [144/200][5500/6787] eta 0:05:28 lr 0.000200 time 0.2605 (0.2554) loss 0.3520 (0.3576) grad_norm 281264.0625 (inf) mem 14543MB +[2023-10-12 23:52:59 simmim_pretrain](main_simmim.py 218): INFO Train: 
[144/200][6000/6787] eta 0:03:21 lr 0.000200 time 0.2610 (0.2558) loss 0.3663 (0.3576) grad_norm 465062.7812 (inf) mem 14543MB +[2023-10-12 23:55:08 simmim_pretrain](main_simmim.py 218): INFO Train: [144/200][6500/6787] eta 0:01:13 lr 0.000200 time 0.2609 (0.2561) loss 0.3539 (0.3575) grad_norm 332526.3125 (inf) mem 14543MB +[2023-10-12 23:56:24 simmim_pretrain](main_simmim.py 228): INFO EPOCH 144 training takes 0:29:00 +[2023-10-12 23:56:25 simmim_pretrain](main_simmim.py 218): INFO Train: [145/200][0/6787] eta 2:31:31 lr 0.000200 time 1.3395 (1.3395) loss 0.3585 (0.3585) grad_norm 509315.8750 (509315.8750) mem 14543MB +[2023-10-12 23:58:32 simmim_pretrain](main_simmim.py 218): INFO Train: [145/200][500/6787] eta 0:26:50 lr 0.000200 time 0.2547 (0.2562) loss 0.3538 (0.3584) grad_norm 287250.8125 (284358.6250) mem 14543MB +[2023-10-13 00:00:39 simmim_pretrain](main_simmim.py 218): INFO Train: [145/200][1000/6787] eta 0:24:36 lr 0.000200 time 0.2563 (0.2552) loss 0.3738 (0.3582) grad_norm 237128.4688 (263038.1250) mem 14543MB +[2023-10-13 00:02:46 simmim_pretrain](main_simmim.py 218): INFO Train: [145/200][1500/6787] eta 0:22:26 lr 0.000200 time 0.2501 (0.2547) loss 0.3392 (0.3583) grad_norm 278416.2812 (254640.7812) mem 14543MB +[2023-10-13 00:04:52 simmim_pretrain](main_simmim.py 218): INFO Train: [145/200][2000/6787] eta 0:20:16 lr 0.000200 time 0.2503 (0.2542) loss 0.3660 (0.3582) grad_norm 219032.8281 (250165.9375) mem 14543MB +[2023-10-13 00:07:00 simmim_pretrain](main_simmim.py 218): INFO Train: [145/200][2500/6787] eta 0:18:10 lr 0.000200 time 0.2539 (0.2543) loss 0.3536 (0.3581) grad_norm 386238.7500 (259889.0312) mem 14543MB +[2023-10-13 00:09:08 simmim_pretrain](main_simmim.py 218): INFO Train: [145/200][3000/6787] eta 0:16:04 lr 0.000200 time 0.2578 (0.2548) loss 0.3443 (0.3580) grad_norm 484961.1875 (272865.6875) mem 14543MB +[2023-10-13 00:11:17 simmim_pretrain](main_simmim.py 218): INFO Train: [145/200][3500/6787] eta 0:13:59 lr 0.000200 time 0.2583 
(0.2553) loss 0.3552 (0.3579) grad_norm 369376.4062 (288776.8438) mem 14543MB +[2023-10-13 00:13:26 simmim_pretrain](main_simmim.py 218): INFO Train: [145/200][4000/6787] eta 0:11:51 lr 0.000200 time 0.2512 (0.2554) loss 0.3522 (0.3577) grad_norm 483793.6875 (305521.8750) mem 14543MB +[2023-10-13 00:15:33 simmim_pretrain](main_simmim.py 218): INFO Train: [145/200][4500/6787] eta 0:09:44 lr 0.000200 time 0.2519 (0.2554) loss 0.3724 (0.3575) grad_norm 238602.7344 (inf) mem 14543MB +[2023-10-13 00:17:40 simmim_pretrain](main_simmim.py 218): INFO Train: [145/200][5000/6787] eta 0:07:36 lr 0.000200 time 0.2550 (0.2552) loss 0.3570 (0.3575) grad_norm 505789.1250 (inf) mem 14543MB +[2023-10-13 00:19:46 simmim_pretrain](main_simmim.py 218): INFO Train: [145/200][5500/6787] eta 0:05:28 lr 0.000200 time 0.2560 (0.2550) loss 0.3412 (0.3575) grad_norm 592765.0625 (inf) mem 14543MB +[2023-10-13 00:21:53 simmim_pretrain](main_simmim.py 218): INFO Train: [145/200][6000/6787] eta 0:03:20 lr 0.000200 time 0.2507 (0.2549) loss 0.3711 (0.3574) grad_norm 445956.4062 (inf) mem 14543MB +[2023-10-13 00:24:00 simmim_pretrain](main_simmim.py 218): INFO Train: [145/200][6500/6787] eta 0:01:13 lr 0.000200 time 0.2509 (0.2547) loss 0.3392 (0.3573) grad_norm 527869.8750 (inf) mem 14543MB +[2023-10-13 00:25:13 simmim_pretrain](main_simmim.py 228): INFO EPOCH 145 training takes 0:28:49 +[2023-10-13 00:25:14 simmim_pretrain](main_simmim.py 218): INFO Train: [146/200][0/6787] eta 2:36:26 lr 0.000200 time 1.3830 (1.3830) loss 0.3692 (0.3692) grad_norm 434800.2188 (434800.2188) mem 14543MB +[2023-10-13 00:27:21 simmim_pretrain](main_simmim.py 218): INFO Train: [146/200][500/6787] eta 0:26:43 lr 0.000200 time 0.2590 (0.2551) loss 0.3455 (0.3570) grad_norm 322820.2812 (inf) mem 14543MB +[2023-10-13 00:29:28 simmim_pretrain](main_simmim.py 218): INFO Train: [146/200][1000/6787] eta 0:24:34 lr 0.000200 time 0.2535 (0.2548) loss 0.3414 (0.3579) grad_norm 262320.1250 (inf) mem 14543MB +[2023-10-13 
00:31:36 simmim_pretrain](main_simmim.py 218): INFO Train: [146/200][1500/6787] eta 0:22:29 lr 0.000200 time 0.2562 (0.2553) loss 0.3635 (0.3580) grad_norm 217056.8594 (inf) mem 14543MB +[2023-10-13 00:33:44 simmim_pretrain](main_simmim.py 218): INFO Train: [146/200][2000/6787] eta 0:20:22 lr 0.000200 time 0.2578 (0.2555) loss 0.3396 (0.3592) grad_norm 239126.0156 (inf) mem 14543MB +[2023-10-13 00:35:52 simmim_pretrain](main_simmim.py 218): INFO Train: [146/200][2500/6787] eta 0:18:15 lr 0.000200 time 0.2590 (0.2556) loss 0.3515 (0.3598) grad_norm 152799.5469 (inf) mem 14543MB +[2023-10-13 00:38:00 simmim_pretrain](main_simmim.py 218): INFO Train: [146/200][3000/6787] eta 0:16:07 lr 0.000200 time 0.2518 (0.2556) loss 0.3935 (0.3599) grad_norm 199040.6406 (inf) mem 14543MB +[2023-10-13 00:40:08 simmim_pretrain](main_simmim.py 218): INFO Train: [146/200][3500/6787] eta 0:13:59 lr 0.000200 time 0.2534 (0.2555) loss 0.3694 (0.3600) grad_norm 83091.8125 (inf) mem 14543MB +[2023-10-13 00:42:15 simmim_pretrain](main_simmim.py 218): INFO Train: [146/200][4000/6787] eta 0:11:51 lr 0.000200 time 0.2572 (0.2553) loss 0.3343 (0.3599) grad_norm 226849.3594 (inf) mem 14543MB +[2023-10-13 00:44:22 simmim_pretrain](main_simmim.py 218): INFO Train: [146/200][4500/6787] eta 0:09:43 lr 0.000200 time 0.2524 (0.2552) loss 0.3787 (0.3598) grad_norm 216481.7031 (inf) mem 14543MB +[2023-10-13 00:46:31 simmim_pretrain](main_simmim.py 218): INFO Train: [146/200][5000/6787] eta 0:07:36 lr 0.000200 time 0.2609 (0.2555) loss 0.3484 (0.3597) grad_norm 331799.2812 (inf) mem 14543MB +[2023-10-13 00:48:41 simmim_pretrain](main_simmim.py 218): INFO Train: [146/200][5500/6787] eta 0:05:29 lr 0.000200 time 0.2606 (0.2559) loss 0.3624 (0.3596) grad_norm 204233.0156 (inf) mem 14543MB +[2023-10-13 00:50:51 simmim_pretrain](main_simmim.py 218): INFO Train: [146/200][6000/6787] eta 0:03:21 lr 0.000200 time 0.2600 (0.2562) loss 0.3379 (0.3595) grad_norm 330938.0312 (inf) mem 14543MB +[2023-10-13 00:53:01 
simmim_pretrain](main_simmim.py 218): INFO Train: [146/200][6500/6787] eta 0:01:13 lr 0.000200 time 0.2600 (0.2565) loss 0.3557 (0.3593) grad_norm 264080.6875 (inf) mem 14543MB +[2023-10-13 00:54:16 simmim_pretrain](main_simmim.py 228): INFO EPOCH 146 training takes 0:29:02 +[2023-10-13 00:54:17 simmim_pretrain](main_simmim.py 218): INFO Train: [147/200][0/6787] eta 2:47:20 lr 0.000200 time 1.4794 (1.4794) loss 0.3573 (0.3573) grad_norm 211475.4219 (211475.4219) mem 14543MB +[2023-10-13 00:56:23 simmim_pretrain](main_simmim.py 218): INFO Train: [147/200][500/6787] eta 0:26:42 lr 0.000200 time 0.2493 (0.2550) loss 0.3644 (0.3586) grad_norm 169592.3281 (241175.1250) mem 14543MB +[2023-10-13 00:58:31 simmim_pretrain](main_simmim.py 218): INFO Train: [147/200][1000/6787] eta 0:24:39 lr 0.000200 time 0.2589 (0.2556) loss 0.3665 (0.3588) grad_norm 347051.6250 (238049.0312) mem 14543MB +[2023-10-13 01:00:40 simmim_pretrain](main_simmim.py 218): INFO Train: [147/200][1500/6787] eta 0:22:35 lr 0.000200 time 0.2580 (0.2564) loss 0.3511 (0.3585) grad_norm 104975.1953 (239609.1875) mem 14543MB +[2023-10-13 01:02:49 simmim_pretrain](main_simmim.py 218): INFO Train: [147/200][2000/6787] eta 0:20:29 lr 0.000200 time 0.2590 (0.2569) loss 0.3505 (0.3582) grad_norm 294382.8438 (253863.4062) mem 14543MB +[2023-10-13 01:04:57 simmim_pretrain](main_simmim.py 218): INFO Train: [147/200][2500/6787] eta 0:18:20 lr 0.000200 time 0.2559 (0.2566) loss 0.3512 (0.3579) grad_norm 201535.5469 (288262.2812) mem 14543MB +[2023-10-13 01:07:06 simmim_pretrain](main_simmim.py 218): INFO Train: [147/200][3000/6787] eta 0:16:11 lr 0.000200 time 0.2601 (0.2566) loss 0.3461 (0.3578) grad_norm 273209.3750 (303969.1875) mem 14543MB +[2023-10-13 01:09:14 simmim_pretrain](main_simmim.py 218): INFO Train: [147/200][3500/6787] eta 0:14:03 lr 0.000200 time 0.2607 (0.2567) loss 0.3483 (0.3577) grad_norm 423470.6875 (324463.2188) mem 14543MB +[2023-10-13 01:11:23 simmim_pretrain](main_simmim.py 218): INFO Train: 
[147/200][4000/6787] eta 0:11:55 lr 0.000200 time 0.2514 (0.2568) loss 0.3388 (0.3575) grad_norm 347963.3750 (inf) mem 14543MB +[2023-10-13 01:13:31 simmim_pretrain](main_simmim.py 218): INFO Train: [147/200][4500/6787] eta 0:09:47 lr 0.000200 time 0.2540 (0.2568) loss 0.3642 (0.3575) grad_norm 427354.7812 (inf) mem 14543MB +[2023-10-13 01:15:40 simmim_pretrain](main_simmim.py 218): INFO Train: [147/200][5000/6787] eta 0:07:38 lr 0.000200 time 0.2533 (0.2568) loss 0.3419 (0.3575) grad_norm 284245.1875 (inf) mem 14543MB +[2023-10-13 01:17:48 simmim_pretrain](main_simmim.py 218): INFO Train: [147/200][5500/6787] eta 0:05:30 lr 0.000200 time 0.2591 (0.2569) loss 0.3724 (0.3575) grad_norm 244469.0625 (inf) mem 14543MB +[2023-10-13 01:19:57 simmim_pretrain](main_simmim.py 218): INFO Train: [147/200][6000/6787] eta 0:03:22 lr 0.000200 time 0.2493 (0.2569) loss 0.3375 (0.3577) grad_norm 268601.1250 (inf) mem 14543MB +[2023-10-13 01:22:06 simmim_pretrain](main_simmim.py 218): INFO Train: [147/200][6500/6787] eta 0:01:13 lr 0.000200 time 0.3587 (0.2570) loss 0.3700 (0.3577) grad_norm 122887.5938 (inf) mem 14543MB +[2023-10-13 01:23:20 simmim_pretrain](main_simmim.py 228): INFO EPOCH 147 training takes 0:29:04 +[2023-10-13 01:23:22 simmim_pretrain](main_simmim.py 218): INFO Train: [148/200][0/6787] eta 2:46:19 lr 0.000200 time 1.4704 (1.4704) loss 0.3338 (0.3338) grad_norm 357979.3438 (357979.3438) mem 14543MB +[2023-10-13 01:25:31 simmim_pretrain](main_simmim.py 218): INFO Train: [148/200][500/6787] eta 0:27:19 lr 0.000200 time 0.2596 (0.2608) loss 0.3456 (0.3572) grad_norm 272372.8438 (292792.9688) mem 14543MB +[2023-10-13 01:27:40 simmim_pretrain](main_simmim.py 218): INFO Train: [148/200][1000/6787] eta 0:24:58 lr 0.000200 time 0.2570 (0.2589) loss 0.3517 (0.3571) grad_norm 393587.7188 (340455.0625) mem 14543MB +[2023-10-13 01:29:48 simmim_pretrain](main_simmim.py 218): INFO Train: [148/200][1500/6787] eta 0:22:44 lr 0.000200 time 0.2497 (0.2581) loss 0.3650 (0.3570) 
grad_norm 427314.7812 (355725.2188) mem 14543MB +[2023-10-13 01:31:56 simmim_pretrain](main_simmim.py 218): INFO Train: [148/200][2000/6787] eta 0:20:33 lr 0.000200 time 0.2544 (0.2576) loss 0.3522 (0.3575) grad_norm 302286.1250 (inf) mem 14543MB +[2023-10-13 01:34:04 simmim_pretrain](main_simmim.py 218): INFO Train: [148/200][2500/6787] eta 0:18:23 lr 0.000200 time 0.2574 (0.2575) loss 0.3650 (0.3576) grad_norm 244324.5469 (inf) mem 14543MB +[2023-10-13 01:36:13 simmim_pretrain](main_simmim.py 218): INFO Train: [148/200][3000/6787] eta 0:16:15 lr 0.000200 time 0.2611 (0.2575) loss 0.3645 (0.3578) grad_norm 247005.0469 (inf) mem 14543MB +[2023-10-13 01:38:22 simmim_pretrain](main_simmim.py 218): INFO Train: [148/200][3500/6787] eta 0:14:06 lr 0.000200 time 0.2580 (0.2574) loss 0.3812 (0.3579) grad_norm 253302.4375 (inf) mem 14543MB +[2023-10-13 01:40:30 simmim_pretrain](main_simmim.py 218): INFO Train: [148/200][4000/6787] eta 0:11:57 lr 0.000200 time 0.2500 (0.2574) loss 0.3407 (0.3579) grad_norm 297803.8438 (inf) mem 14543MB +[2023-10-13 01:42:39 simmim_pretrain](main_simmim.py 218): INFO Train: [148/200][4500/6787] eta 0:09:48 lr 0.000200 time 0.2545 (0.2574) loss 0.3601 (0.3578) grad_norm 430838.8750 (inf) mem 14543MB +[2023-10-13 01:44:48 simmim_pretrain](main_simmim.py 218): INFO Train: [148/200][5000/6787] eta 0:07:39 lr 0.000200 time 0.2530 (0.2574) loss 0.3491 (0.3577) grad_norm 333429.0312 (inf) mem 14543MB +[2023-10-13 01:46:56 simmim_pretrain](main_simmim.py 218): INFO Train: [148/200][5500/6787] eta 0:05:31 lr 0.000200 time 0.2458 (0.2573) loss 0.3570 (0.3576) grad_norm 301454.5625 (inf) mem 14543MB +[2023-10-13 01:49:04 simmim_pretrain](main_simmim.py 218): INFO Train: [148/200][6000/6787] eta 0:03:22 lr 0.000200 time 0.2539 (0.2573) loss 0.3670 (0.3576) grad_norm 555655.0000 (inf) mem 14543MB +[2023-10-13 01:51:13 simmim_pretrain](main_simmim.py 218): INFO Train: [148/200][6500/6787] eta 0:01:13 lr 0.000200 time 0.2547 (0.2573) loss 0.3523 (0.3577) 
grad_norm 292190.5312 (inf) mem 14543MB +[2023-10-13 01:52:27 simmim_pretrain](main_simmim.py 228): INFO EPOCH 148 training takes 0:29:06 +[2023-10-13 01:52:28 simmim_pretrain](main_simmim.py 218): INFO Train: [149/200][0/6787] eta 2:40:08 lr 0.000200 time 1.4157 (1.4157) loss 0.3588 (0.3588) grad_norm 403019.2812 (403019.2812) mem 14543MB +[2023-10-13 01:54:37 simmim_pretrain](main_simmim.py 218): INFO Train: [149/200][500/6787] eta 0:27:05 lr 0.000200 time 0.2530 (0.2586) loss 0.3609 (0.3600) grad_norm 267265.0938 (271737.1562) mem 14543MB +[2023-10-13 01:56:45 simmim_pretrain](main_simmim.py 218): INFO Train: [149/200][1000/6787] eta 0:24:52 lr 0.000200 time 0.2602 (0.2578) loss 0.3482 (0.3591) grad_norm 298225.5938 (267084.9688) mem 14543MB +[2023-10-13 01:58:54 simmim_pretrain](main_simmim.py 218): INFO Train: [149/200][1500/6787] eta 0:22:42 lr 0.000200 time 0.2572 (0.2577) loss 0.3622 (0.3593) grad_norm 224466.4375 (264735.8750) mem 14543MB +[2023-10-13 02:01:03 simmim_pretrain](main_simmim.py 218): INFO Train: [149/200][2000/6787] eta 0:20:33 lr 0.000200 time 0.2571 (0.2577) loss 0.3543 (0.3589) grad_norm 395137.3750 (278995.8750) mem 14543MB +[2023-10-13 02:03:11 simmim_pretrain](main_simmim.py 218): INFO Train: [149/200][2500/6787] eta 0:18:24 lr 0.000200 time 0.2559 (0.2576) loss 0.3610 (0.3583) grad_norm 357110.0000 (301106.0000) mem 14543MB +[2023-10-13 02:05:20 simmim_pretrain](main_simmim.py 218): INFO Train: [149/200][3000/6787] eta 0:16:15 lr 0.000200 time 0.2608 (0.2575) loss 0.3617 (0.3583) grad_norm 395157.3750 (inf) mem 14543MB +[2023-10-13 02:07:28 simmim_pretrain](main_simmim.py 218): INFO Train: [149/200][3500/6787] eta 0:14:06 lr 0.000200 time 0.2594 (0.2574) loss 0.3586 (0.3583) grad_norm 382023.6875 (inf) mem 14543MB +[2023-10-13 02:09:37 simmim_pretrain](main_simmim.py 218): INFO Train: [149/200][4000/6787] eta 0:11:57 lr 0.000200 time 0.2587 (0.2573) loss 0.3418 (0.3584) grad_norm 241769.4062 (inf) mem 14543MB +[2023-10-13 02:11:45 
simmim_pretrain](main_simmim.py 218): INFO Train: [149/200][4500/6787] eta 0:09:48 lr 0.000200 time 0.2569 (0.2573) loss 0.3659 (0.3585) grad_norm 384938.4688 (inf) mem 14543MB +[2023-10-13 02:13:54 simmim_pretrain](main_simmim.py 218): INFO Train: [149/200][5000/6787] eta 0:07:39 lr 0.000200 time 0.2503 (0.2573) loss 0.3620 (0.3584) grad_norm 189740.0469 (inf) mem 14543MB +[2023-10-13 02:16:03 simmim_pretrain](main_simmim.py 218): INFO Train: [149/200][5500/6787] eta 0:05:31 lr 0.000200 time 0.2563 (0.2573) loss 0.3393 (0.3583) grad_norm 446399.0312 (inf) mem 14543MB +[2023-10-13 02:18:11 simmim_pretrain](main_simmim.py 218): INFO Train: [149/200][6000/6787] eta 0:03:22 lr 0.000200 time 0.2507 (0.2573) loss 0.3549 (0.3582) grad_norm 570175.2500 (inf) mem 14543MB +[2023-10-13 02:20:20 simmim_pretrain](main_simmim.py 218): INFO Train: [149/200][6500/6787] eta 0:01:13 lr 0.000200 time 0.2587 (0.2574) loss 0.3556 (0.3580) grad_norm 495990.3125 (inf) mem 14543MB +[2023-10-13 02:21:35 simmim_pretrain](main_simmim.py 228): INFO EPOCH 149 training takes 0:29:07 +[2023-10-13 02:21:36 simmim_pretrain](main_simmim.py 218): INFO Train: [150/200][0/6787] eta 2:42:07 lr 0.000200 time 1.4332 (1.4332) loss 0.3699 (0.3699) grad_norm 353860.7500 (353860.7500) mem 14543MB +[2023-10-13 02:23:45 simmim_pretrain](main_simmim.py 218): INFO Train: [150/200][500/6787] eta 0:27:14 lr 0.000200 time 0.2555 (0.2599) loss 0.3398 (0.3568) grad_norm 455074.1562 (inf) mem 14543MB +[2023-10-13 02:25:53 simmim_pretrain](main_simmim.py 218): INFO Train: [150/200][1000/6787] eta 0:24:56 lr 0.000200 time 0.2596 (0.2585) loss 0.3575 (0.3577) grad_norm 196789.5312 (inf) mem 14543MB +[2023-10-13 02:28:02 simmim_pretrain](main_simmim.py 218): INFO Train: [150/200][1500/6787] eta 0:22:43 lr 0.000200 time 0.2540 (0.2579) loss 0.3870 (0.3583) grad_norm 336320.4688 (inf) mem 14543MB +[2023-10-13 02:30:10 simmim_pretrain](main_simmim.py 218): INFO Train: [150/200][2000/6787] eta 0:20:33 lr 0.000200 time 0.2529 
(0.2576) loss 0.3658 (0.3583) grad_norm 150733.2344 (inf) mem 14543MB +[2023-10-13 02:32:19 simmim_pretrain](main_simmim.py 218): INFO Train: [150/200][2500/6787] eta 0:18:24 lr 0.000200 time 0.2597 (0.2576) loss 0.3405 (0.3584) grad_norm 273879.8125 (inf) mem 14543MB +[2023-10-13 02:34:28 simmim_pretrain](main_simmim.py 218): INFO Train: [150/200][3000/6787] eta 0:16:15 lr 0.000200 time 0.2562 (0.2576) loss 0.3691 (0.3583) grad_norm 507146.6562 (inf) mem 14543MB +[2023-10-13 02:36:37 simmim_pretrain](main_simmim.py 218): INFO Train: [150/200][3500/6787] eta 0:14:06 lr 0.000200 time 0.2575 (0.2576) loss 0.3571 (0.3583) grad_norm 379908.3750 (inf) mem 14543MB +[2023-10-13 02:38:45 simmim_pretrain](main_simmim.py 218): INFO Train: [150/200][4000/6787] eta 0:11:57 lr 0.000200 time 0.2546 (0.2575) loss 0.3515 (0.3582) grad_norm 299977.9688 (inf) mem 14543MB +[2023-10-13 02:40:54 simmim_pretrain](main_simmim.py 218): INFO Train: [150/200][4500/6787] eta 0:09:49 lr 0.000200 time 0.2569 (0.2576) loss 0.3840 (0.3583) grad_norm 225012.5156 (inf) mem 14543MB +[2023-10-13 02:43:03 simmim_pretrain](main_simmim.py 218): INFO Train: [150/200][5000/6787] eta 0:07:40 lr 0.000200 time 0.2543 (0.2575) loss 0.3767 (0.3583) grad_norm 239171.5312 (inf) mem 14543MB +[2023-10-13 02:45:11 simmim_pretrain](main_simmim.py 218): INFO Train: [150/200][5500/6787] eta 0:05:31 lr 0.000200 time 0.2592 (0.2575) loss 0.5483 (0.3655) grad_norm 1782.6455 (inf) mem 14543MB +[2023-10-13 02:47:20 simmim_pretrain](main_simmim.py 218): INFO Train: [150/200][6000/6787] eta 0:03:22 lr 0.000200 time 0.2516 (0.2575) loss 0.5035 (0.3772) grad_norm 20636.1270 (inf) mem 14543MB +[2023-10-13 02:49:28 simmim_pretrain](main_simmim.py 218): INFO Train: [150/200][6500/6787] eta 0:01:13 lr 0.000200 time 0.2605 (0.2574) loss 0.4623 (0.3849) grad_norm 25645.9219 (inf) mem 14543MB +[2023-10-13 02:50:42 simmim_pretrain](main_simmim.py 228): INFO EPOCH 150 training takes 0:29:07 +[2023-10-13 02:50:44 
simmim_pretrain](main_simmim.py 218): INFO Train: [151/200][0/6787] eta 2:38:48 lr 0.000200 time 1.4040 (1.4040) loss 0.4102 (0.4102) grad_norm 13462.2559 (13462.2559) mem 14543MB +[2023-10-13 02:52:53 simmim_pretrain](main_simmim.py 218): INFO Train: [151/200][500/6787] eta 0:27:15 lr 0.000200 time 0.2570 (0.2601) loss 0.3867 (0.4030) grad_norm 17193.7891 (15487.4199) mem 14543MB +[2023-10-13 02:55:01 simmim_pretrain](main_simmim.py 218): INFO Train: [151/200][1000/6787] eta 0:24:55 lr 0.000200 time 0.2575 (0.2584) loss 0.3982 (0.3895) grad_norm 20401.3320 (18724.9551) mem 14543MB +[2023-10-13 02:57:09 simmim_pretrain](main_simmim.py 218): INFO Train: [151/200][1500/6787] eta 0:22:43 lr 0.000200 time 0.2523 (0.2578) loss 0.3558 (0.3831) grad_norm 21270.8145 (20845.8789) mem 14543MB +[2023-10-13 02:59:17 simmim_pretrain](main_simmim.py 218): INFO Train: [151/200][2000/6787] eta 0:20:32 lr 0.000200 time 0.2557 (0.2574) loss 0.3790 (0.3791) grad_norm 23940.1211 (21782.5605) mem 14543MB +[2023-10-13 03:01:26 simmim_pretrain](main_simmim.py 218): INFO Train: [151/200][2500/6787] eta 0:18:23 lr 0.000200 time 0.2536 (0.2574) loss 0.3604 (0.3767) grad_norm 81420.7109 (22379.1445) mem 14543MB +[2023-10-13 03:03:34 simmim_pretrain](main_simmim.py 218): INFO Train: [151/200][3000/6787] eta 0:16:14 lr 0.000200 time 0.2546 (0.2573) loss 0.3600 (0.3747) grad_norm 77059.1797 (24392.2520) mem 14543MB +[2023-10-13 03:05:43 simmim_pretrain](main_simmim.py 218): INFO Train: [151/200][3500/6787] eta 0:14:05 lr 0.000200 time 0.2585 (0.2573) loss 0.3695 (0.3729) grad_norm 44571.5820 (27122.8027) mem 14543MB +[2023-10-13 03:07:52 simmim_pretrain](main_simmim.py 218): INFO Train: [151/200][4000/6787] eta 0:11:57 lr 0.000200 time 0.2573 (0.2574) loss 0.3584 (0.3716) grad_norm 38739.6602 (28973.2324) mem 14543MB +[2023-10-13 03:10:00 simmim_pretrain](main_simmim.py 218): INFO Train: [151/200][4500/6787] eta 0:09:48 lr 0.000200 time 0.2578 (0.2573) loss 0.3468 (0.3705) grad_norm 62294.9141 
(30826.6465) mem 14543MB +[2023-10-13 03:12:09 simmim_pretrain](main_simmim.py 218): INFO Train: [151/200][5000/6787] eta 0:07:39 lr 0.000200 time 0.2581 (0.2573) loss 0.3878 (0.3695) grad_norm 57446.9414 (34103.2031) mem 14543MB +[2023-10-13 03:14:17 simmim_pretrain](main_simmim.py 218): INFO Train: [151/200][5500/6787] eta 0:05:31 lr 0.000200 time 0.2605 (0.2572) loss 0.3756 (0.3688) grad_norm 106045.9844 (37038.3789) mem 14543MB +[2023-10-13 03:16:25 simmim_pretrain](main_simmim.py 218): INFO Train: [151/200][6000/6787] eta 0:03:22 lr 0.000200 time 0.2614 (0.2571) loss 0.3640 (0.3680) grad_norm 109342.7734 (40335.4336) mem 14543MB +[2023-10-13 03:18:35 simmim_pretrain](main_simmim.py 218): INFO Train: [151/200][6500/6787] eta 0:01:13 lr 0.000200 time 0.2597 (0.2573) loss 0.3543 (0.3674) grad_norm 205913.9531 (43840.9336) mem 14543MB +[2023-10-13 03:19:50 simmim_pretrain](main_simmim.py 228): INFO EPOCH 151 training takes 0:29:07 +[2023-10-13 03:19:52 simmim_pretrain](main_simmim.py 218): INFO Train: [152/200][0/6787] eta 2:46:33 lr 0.000200 time 1.4724 (1.4724) loss 0.3513 (0.3513) grad_norm 111140.8203 (111140.8203) mem 14543MB +[2023-10-13 03:21:58 simmim_pretrain](main_simmim.py 218): INFO Train: [152/200][500/6787] eta 0:26:38 lr 0.000200 time 0.2522 (0.2542) loss 0.3485 (0.3579) grad_norm 129025.0234 (118477.9375) mem 14543MB +[2023-10-13 03:24:04 simmim_pretrain](main_simmim.py 218): INFO Train: [152/200][1000/6787] eta 0:24:26 lr 0.000200 time 0.2592 (0.2534) loss 0.3452 (0.3585) grad_norm 194501.3438 (126527.5391) mem 14543MB +[2023-10-13 03:26:10 simmim_pretrain](main_simmim.py 218): INFO Train: [152/200][1500/6787] eta 0:22:17 lr 0.000200 time 0.2587 (0.2529) loss 0.3579 (0.3590) grad_norm 118416.9219 (135834.5625) mem 14543MB +[2023-10-13 03:28:16 simmim_pretrain](main_simmim.py 218): INFO Train: [152/200][2000/6787] eta 0:20:10 lr 0.000200 time 0.2530 (0.2528) loss 0.3569 (0.3588) grad_norm 369486.2500 (149380.3906) mem 14543MB +[2023-10-13 03:30:23 
simmim_pretrain](main_simmim.py 218): INFO Train: [152/200][2500/6787] eta 0:18:03 lr 0.000200 time 0.2524 (0.2528) loss 0.3456 (0.3586) grad_norm 197205.8125 (164560.0000) mem 14543MB +[2023-10-13 03:32:30 simmim_pretrain](main_simmim.py 218): INFO Train: [152/200][3000/6787] eta 0:15:58 lr 0.000200 time 0.2481 (0.2531) loss 0.3715 (0.3584) grad_norm 286720.9688 (201639.7812) mem 14543MB +[2023-10-13 03:34:37 simmim_pretrain](main_simmim.py 218): INFO Train: [152/200][3500/6787] eta 0:13:52 lr 0.000200 time 0.2549 (0.2533) loss 0.3885 (0.3583) grad_norm 218390.5781 (inf) mem 14543MB +[2023-10-13 03:36:45 simmim_pretrain](main_simmim.py 218): INFO Train: [152/200][4000/6787] eta 0:11:46 lr 0.000200 time 0.2547 (0.2536) loss 0.3567 (0.3584) grad_norm 144553.0000 (inf) mem 14543MB +[2023-10-13 03:38:53 simmim_pretrain](main_simmim.py 218): INFO Train: [152/200][4500/6787] eta 0:09:40 lr 0.000200 time 0.2543 (0.2538) loss 0.3598 (0.3584) grad_norm 217747.8594 (inf) mem 14543MB +[2023-10-13 03:41:01 simmim_pretrain](main_simmim.py 218): INFO Train: [152/200][5000/6787] eta 0:07:34 lr 0.000200 time 0.2557 (0.2541) loss 0.3733 (0.3582) grad_norm 178488.0469 (inf) mem 14543MB +[2023-10-13 03:43:09 simmim_pretrain](main_simmim.py 218): INFO Train: [152/200][5500/6787] eta 0:05:27 lr 0.000200 time 0.2609 (0.2543) loss 0.3376 (0.3584) grad_norm 152598.9375 (inf) mem 14543MB +[2023-10-13 03:45:18 simmim_pretrain](main_simmim.py 218): INFO Train: [152/200][6000/6787] eta 0:03:20 lr 0.000200 time 0.2548 (0.2547) loss 0.3764 (0.3586) grad_norm 88212.9766 (inf) mem 14543MB +[2023-10-13 03:47:27 simmim_pretrain](main_simmim.py 218): INFO Train: [152/200][6500/6787] eta 0:01:13 lr 0.000200 time 0.2516 (0.2548) loss 0.3510 (0.3587) grad_norm 114580.9141 (inf) mem 14543MB +[2023-10-13 03:48:40 simmim_pretrain](main_simmim.py 228): INFO EPOCH 152 training takes 0:28:50 +[2023-10-13 03:48:42 simmim_pretrain](main_simmim.py 218): INFO Train: [153/200][0/6787] eta 2:53:06 lr 0.000200 
time 1.5303 (1.5303) loss 0.3431 (0.3431) grad_norm 88908.8828 (88908.8828) mem 14543MB +[2023-10-13 03:50:50 simmim_pretrain](main_simmim.py 218): INFO Train: [153/200][500/6787] eta 0:26:59 lr 0.000200 time 0.2538 (0.2575) loss 0.3524 (0.3590) grad_norm 71872.1328 (104846.9062) mem 14543MB +[2023-10-13 03:52:58 simmim_pretrain](main_simmim.py 218): INFO Train: [153/200][1000/6787] eta 0:24:51 lr 0.000200 time 0.2616 (0.2577) loss 0.3828 (0.3587) grad_norm 171802.3281 (116874.8906) mem 14543MB +[2023-10-13 03:55:09 simmim_pretrain](main_simmim.py 218): INFO Train: [153/200][1500/6787] eta 0:22:46 lr 0.000200 time 0.2613 (0.2585) loss 0.3540 (0.3584) grad_norm 376390.4062 (128587.7812) mem 14543MB +[2023-10-13 03:57:19 simmim_pretrain](main_simmim.py 218): INFO Train: [153/200][2000/6787] eta 0:20:40 lr 0.000200 time 0.2547 (0.2590) loss 0.3691 (0.3585) grad_norm 193374.7656 (147317.0312) mem 14543MB +[2023-10-13 03:59:27 simmim_pretrain](main_simmim.py 218): INFO Train: [153/200][2500/6787] eta 0:18:28 lr 0.000200 time 0.2567 (0.2586) loss 0.3634 (0.3583) grad_norm 145557.3906 (154220.4531) mem 14543MB +[2023-10-13 04:01:35 simmim_pretrain](main_simmim.py 218): INFO Train: [153/200][3000/6787] eta 0:16:17 lr 0.000200 time 0.2565 (0.2582) loss 0.3662 (0.3579) grad_norm 259877.6094 (176576.6875) mem 14543MB +[2023-10-13 04:03:45 simmim_pretrain](main_simmim.py 218): INFO Train: [153/200][3500/6787] eta 0:14:08 lr 0.000200 time 0.2606 (0.2582) loss 0.3922 (0.3578) grad_norm 456254.2500 (187944.7031) mem 14543MB +[2023-10-13 04:05:53 simmim_pretrain](main_simmim.py 218): INFO Train: [153/200][4000/6787] eta 0:11:59 lr 0.000200 time 0.2536 (0.2581) loss 0.3480 (0.3576) grad_norm 393385.7500 (200468.7344) mem 14543MB +[2023-10-13 04:08:01 simmim_pretrain](main_simmim.py 218): INFO Train: [153/200][4500/6787] eta 0:09:49 lr 0.000200 time 0.2591 (0.2577) loss 0.3633 (0.3575) grad_norm 300445.6875 (218728.2188) mem 14543MB +[2023-10-13 04:10:09 
simmim_pretrain](main_simmim.py 218): INFO Train: [153/200][5000/6787] eta 0:07:40 lr 0.000200 time 0.2532 (0.2576) loss 0.3641 (0.3574) grad_norm 391887.3750 (243255.8750) mem 14543MB +[2023-10-13 04:12:17 simmim_pretrain](main_simmim.py 218): INFO Train: [153/200][5500/6787] eta 0:05:31 lr 0.000200 time 0.2528 (0.2574) loss 0.3732 (0.3573) grad_norm 394974.6250 (inf) mem 14543MB +[2023-10-13 04:14:24 simmim_pretrain](main_simmim.py 218): INFO Train: [153/200][6000/6787] eta 0:03:22 lr 0.000200 time 0.2462 (0.2572) loss 0.3477 (0.3572) grad_norm 362436.6562 (inf) mem 14543MB +[2023-10-13 04:16:31 simmim_pretrain](main_simmim.py 218): INFO Train: [153/200][6500/6787] eta 0:01:13 lr 0.000200 time 0.2526 (0.2570) loss 0.3289 (0.3572) grad_norm 457967.6875 (inf) mem 14543MB +[2023-10-13 04:17:45 simmim_pretrain](main_simmim.py 228): INFO EPOCH 153 training takes 0:29:04 +[2023-10-13 04:17:47 simmim_pretrain](main_simmim.py 218): INFO Train: [154/200][0/6787] eta 2:49:52 lr 0.000200 time 1.5018 (1.5018) loss 0.3556 (0.3556) grad_norm 351122.5000 (351122.5000) mem 14543MB +[2023-10-13 04:19:53 simmim_pretrain](main_simmim.py 218): INFO Train: [154/200][500/6787] eta 0:26:46 lr 0.000200 time 0.2496 (0.2556) loss 0.3384 (0.3568) grad_norm 426157.5312 (365153.0938) mem 14543MB +[2023-10-13 04:22:00 simmim_pretrain](main_simmim.py 218): INFO Train: [154/200][1000/6787] eta 0:24:32 lr 0.000200 time 0.2513 (0.2545) loss 0.3527 (0.3566) grad_norm 462092.5312 (inf) mem 14543MB +[2023-10-13 04:24:07 simmim_pretrain](main_simmim.py 218): INFO Train: [154/200][1500/6787] eta 0:22:24 lr 0.000200 time 0.2565 (0.2543) loss 0.3599 (0.3566) grad_norm 362926.0625 (inf) mem 14543MB +[2023-10-13 04:26:13 simmim_pretrain](main_simmim.py 218): INFO Train: [154/200][2000/6787] eta 0:20:15 lr 0.000200 time 0.2546 (0.2540) loss 0.3437 (0.3565) grad_norm 385189.7500 (inf) mem 14543MB +[2023-10-13 04:28:20 simmim_pretrain](main_simmim.py 218): INFO Train: [154/200][2500/6787] eta 0:18:08 lr 
0.000200 time 0.2525 (0.2539) loss 0.3511 (0.3565) grad_norm 380030.4375 (inf) mem 14543MB +[2023-10-13 04:30:27 simmim_pretrain](main_simmim.py 218): INFO Train: [154/200][3000/6787] eta 0:16:01 lr 0.000200 time 0.2529 (0.2538) loss 0.3349 (0.3565) grad_norm 497230.6562 (inf) mem 14543MB +[2023-10-13 04:32:34 simmim_pretrain](main_simmim.py 218): INFO Train: [154/200][3500/6787] eta 0:13:54 lr 0.000200 time 0.2541 (0.2538) loss 0.3668 (0.3564) grad_norm 478590.4688 (inf) mem 14543MB +[2023-10-13 04:34:40 simmim_pretrain](main_simmim.py 218): INFO Train: [154/200][4000/6787] eta 0:11:47 lr 0.000200 time 0.2537 (0.2537) loss 0.3513 (0.3563) grad_norm 353974.0000 (inf) mem 14543MB +[2023-10-13 04:36:47 simmim_pretrain](main_simmim.py 218): INFO Train: [154/200][4500/6787] eta 0:09:40 lr 0.000200 time 0.2524 (0.2538) loss 0.3305 (0.3562) grad_norm 280012.6250 (inf) mem 14543MB +[2023-10-13 04:38:54 simmim_pretrain](main_simmim.py 218): INFO Train: [154/200][5000/6787] eta 0:07:33 lr 0.000200 time 0.2522 (0.2538) loss 0.3422 (0.3562) grad_norm 510298.7812 (inf) mem 14543MB +[2023-10-13 04:41:01 simmim_pretrain](main_simmim.py 218): INFO Train: [154/200][5500/6787] eta 0:05:26 lr 0.000200 time 0.2537 (0.2538) loss 0.3462 (0.3562) grad_norm 221191.6406 (inf) mem 14543MB +[2023-10-13 04:43:07 simmim_pretrain](main_simmim.py 218): INFO Train: [154/200][6000/6787] eta 0:03:19 lr 0.000200 time 0.2525 (0.2536) loss 0.3580 (0.3563) grad_norm 277066.1562 (inf) mem 14543MB +[2023-10-13 04:45:13 simmim_pretrain](main_simmim.py 218): INFO Train: [154/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2513 (0.2535) loss 0.3458 (0.3563) grad_norm 185827.8125 (inf) mem 14543MB +[2023-10-13 04:46:26 simmim_pretrain](main_simmim.py 228): INFO EPOCH 154 training takes 0:28:40 +[2023-10-13 04:46:27 simmim_pretrain](main_simmim.py 218): INFO Train: [155/200][0/6787] eta 2:58:16 lr 0.000200 time 1.5760 (1.5760) loss 0.3474 (0.3474) grad_norm 361899.0000 (361899.0000) mem 14543MB +[2023-10-13 
04:48:33 simmim_pretrain](main_simmim.py 218): INFO Train: [155/200][500/6787] eta 0:26:33 lr 0.000200 time 0.2528 (0.2535) loss 0.3478 (0.3581) grad_norm 240292.4531 (inf) mem 14543MB +[2023-10-13 04:50:38 simmim_pretrain](main_simmim.py 218): INFO Train: [155/200][1000/6787] eta 0:24:19 lr 0.000200 time 0.2524 (0.2522) loss 0.3553 (0.3577) grad_norm 197346.3281 (inf) mem 14543MB +[2023-10-13 04:52:44 simmim_pretrain](main_simmim.py 218): INFO Train: [155/200][1500/6787] eta 0:22:13 lr 0.000200 time 0.2583 (0.2522) loss 0.3633 (0.3579) grad_norm 257376.0469 (inf) mem 14543MB +[2023-10-13 04:54:51 simmim_pretrain](main_simmim.py 218): INFO Train: [155/200][2000/6787] eta 0:20:09 lr 0.000200 time 0.2579 (0.2526) loss 0.3523 (0.3580) grad_norm 124071.3906 (inf) mem 14543MB +[2023-10-13 04:56:59 simmim_pretrain](main_simmim.py 218): INFO Train: [155/200][2500/6787] eta 0:18:05 lr 0.000200 time 0.2612 (0.2532) loss 0.3725 (0.3578) grad_norm 335331.0938 (inf) mem 14543MB +[2023-10-13 04:59:08 simmim_pretrain](main_simmim.py 218): INFO Train: [155/200][3000/6787] eta 0:16:01 lr 0.000200 time 0.2609 (0.2539) loss 0.3728 (0.3579) grad_norm 255552.9844 (inf) mem 14543MB +[2023-10-13 05:01:16 simmim_pretrain](main_simmim.py 218): INFO Train: [155/200][3500/6787] eta 0:13:56 lr 0.000200 time 0.2471 (0.2544) loss 0.3641 (0.3577) grad_norm 201499.5625 (inf) mem 14543MB +[2023-10-13 05:03:25 simmim_pretrain](main_simmim.py 218): INFO Train: [155/200][4000/6787] eta 0:11:49 lr 0.000200 time 0.2567 (0.2547) loss 0.3288 (0.3577) grad_norm 186453.6406 (inf) mem 14543MB +[2023-10-13 05:05:33 simmim_pretrain](main_simmim.py 218): INFO Train: [155/200][4500/6787] eta 0:09:42 lr 0.000200 time 0.2533 (0.2549) loss 0.3303 (0.3576) grad_norm 213723.7969 (inf) mem 14543MB +[2023-10-13 05:07:42 simmim_pretrain](main_simmim.py 218): INFO Train: [155/200][5000/6787] eta 0:07:36 lr 0.000200 time 0.2602 (0.2553) loss 0.3532 (0.3577) grad_norm 192604.1875 (inf) mem 14543MB +[2023-10-13 05:09:52 
simmim_pretrain](main_simmim.py 218): INFO Train: [155/200][5500/6787] eta 0:05:29 lr 0.000200 time 0.2603 (0.2557) loss 0.3498 (0.3576) grad_norm 272069.6562 (inf) mem 14543MB +[2023-10-13 05:12:01 simmim_pretrain](main_simmim.py 218): INFO Train: [155/200][6000/6787] eta 0:03:21 lr 0.000200 time 0.2572 (0.2558) loss 0.3418 (0.3576) grad_norm 310849.5000 (inf) mem 14543MB +[2023-10-13 05:14:10 simmim_pretrain](main_simmim.py 218): INFO Train: [155/200][6500/6787] eta 0:01:13 lr 0.000200 time 0.2596 (0.2560) loss 0.3525 (0.3577) grad_norm 389106.2188 (inf) mem 14543MB +[2023-10-13 05:15:24 simmim_pretrain](main_simmim.py 228): INFO EPOCH 155 training takes 0:28:58 +[2023-10-13 05:15:25 simmim_pretrain](main_simmim.py 218): INFO Train: [156/200][0/6787] eta 2:44:36 lr 0.000200 time 1.4553 (1.4553) loss 0.3663 (0.3663) grad_norm 371849.5000 (371849.5000) mem 14543MB +[2023-10-13 05:17:34 simmim_pretrain](main_simmim.py 218): INFO Train: [156/200][500/6787] eta 0:27:13 lr 0.000200 time 0.2593 (0.2598) loss 0.3414 (0.3566) grad_norm 493985.0938 (377549.1875) mem 14543MB +[2023-10-13 05:19:44 simmim_pretrain](main_simmim.py 218): INFO Train: [156/200][1000/6787] eta 0:25:05 lr 0.000200 time 0.2611 (0.2601) loss 0.3656 (0.3564) grad_norm 274734.3750 (inf) mem 14543MB +[2023-10-13 05:21:54 simmim_pretrain](main_simmim.py 218): INFO Train: [156/200][1500/6787] eta 0:22:55 lr 0.000200 time 0.2573 (0.2602) loss 0.3597 (0.3565) grad_norm 360929.5938 (inf) mem 14543MB +[2023-10-13 05:24:04 simmim_pretrain](main_simmim.py 218): INFO Train: [156/200][2000/6787] eta 0:20:45 lr 0.000200 time 0.2590 (0.2602) loss 0.3390 (0.3563) grad_norm 528151.4375 (inf) mem 14543MB +[2023-10-13 05:26:15 simmim_pretrain](main_simmim.py 218): INFO Train: [156/200][2500/6787] eta 0:18:35 lr 0.000200 time 0.2596 (0.2603) loss 0.3266 (0.3563) grad_norm 295734.4688 (inf) mem 14543MB +[2023-10-13 05:28:25 simmim_pretrain](main_simmim.py 218): INFO Train: [156/200][3000/6787] eta 0:16:25 lr 0.000200 
time 0.2611 (0.2604) loss 0.3476 (0.3562) grad_norm 489478.5625 (inf) mem 14543MB +[2023-10-13 05:30:36 simmim_pretrain](main_simmim.py 218): INFO Train: [156/200][3500/6787] eta 0:14:15 lr 0.000200 time 0.2611 (0.2604) loss 0.3662 (0.3563) grad_norm 518558.8750 (inf) mem 14543MB +[2023-10-13 05:32:46 simmim_pretrain](main_simmim.py 218): INFO Train: [156/200][4000/6787] eta 0:12:05 lr 0.000200 time 0.2607 (0.2604) loss 0.3533 (0.3565) grad_norm 171952.5000 (inf) mem 14543MB +[2023-10-13 05:34:56 simmim_pretrain](main_simmim.py 218): INFO Train: [156/200][4500/6787] eta 0:09:55 lr 0.000200 time 0.2608 (0.2605) loss 0.3420 (0.3567) grad_norm 240473.8281 (inf) mem 14543MB +[2023-10-13 05:37:07 simmim_pretrain](main_simmim.py 218): INFO Train: [156/200][5000/6787] eta 0:07:45 lr 0.000200 time 0.2606 (0.2605) loss 0.3475 (0.3570) grad_norm 279045.9375 (inf) mem 14543MB +[2023-10-13 05:39:17 simmim_pretrain](main_simmim.py 218): INFO Train: [156/200][5500/6787] eta 0:05:35 lr 0.000200 time 0.2611 (0.2605) loss 0.3566 (0.3572) grad_norm 171222.9531 (inf) mem 14543MB +[2023-10-13 05:41:27 simmim_pretrain](main_simmim.py 218): INFO Train: [156/200][6000/6787] eta 0:03:25 lr 0.000200 time 0.2607 (0.2605) loss 0.3543 (0.3571) grad_norm 490550.9062 (inf) mem 14543MB +[2023-10-13 05:43:38 simmim_pretrain](main_simmim.py 218): INFO Train: [156/200][6500/6787] eta 0:01:14 lr 0.000200 time 0.2611 (0.2605) loss 0.3655 (0.3571) grad_norm 328912.0625 (inf) mem 14543MB +[2023-10-13 05:44:53 simmim_pretrain](main_simmim.py 228): INFO EPOCH 156 training takes 0:29:29 +[2023-10-13 05:44:54 simmim_pretrain](main_simmim.py 218): INFO Train: [157/200][0/6787] eta 2:42:18 lr 0.000200 time 1.4348 (1.4348) loss 0.3433 (0.3433) grad_norm 537741.4375 (537741.4375) mem 14543MB +[2023-10-13 05:47:02 simmim_pretrain](main_simmim.py 218): INFO Train: [157/200][500/6787] eta 0:26:59 lr 0.000200 time 0.2589 (0.2576) loss 0.3615 (0.3575) grad_norm 281209.5312 (inf) mem 14543MB +[2023-10-13 05:49:10 
simmim_pretrain](main_simmim.py 218): INFO Train: [157/200][1000/6787] eta 0:24:47 lr 0.000200 time 0.2545 (0.2571) loss 0.3620 (0.3593) grad_norm 114405.8750 (inf) mem 14543MB +[2023-10-13 05:51:20 simmim_pretrain](main_simmim.py 218): INFO Train: [157/200][1500/6787] eta 0:22:44 lr 0.000200 time 0.2612 (0.2580) loss 0.3707 (0.3593) grad_norm 122775.2891 (inf) mem 14543MB +[2023-10-13 05:53:30 simmim_pretrain](main_simmim.py 218): INFO Train: [157/200][2000/6787] eta 0:20:37 lr 0.000200 time 0.2613 (0.2586) loss 0.3496 (0.3594) grad_norm 175143.0469 (inf) mem 14543MB +[2023-10-13 05:55:40 simmim_pretrain](main_simmim.py 218): INFO Train: [157/200][2500/6787] eta 0:18:30 lr 0.000200 time 0.2610 (0.2589) loss 0.3827 (0.3598) grad_norm 243360.6562 (inf) mem 14543MB +[2023-10-13 05:57:50 simmim_pretrain](main_simmim.py 218): INFO Train: [157/200][3000/6787] eta 0:16:21 lr 0.000200 time 0.2608 (0.2591) loss 0.3679 (0.3598) grad_norm 299998.3750 (inf) mem 14543MB +[2023-10-13 06:00:00 simmim_pretrain](main_simmim.py 218): INFO Train: [157/200][3500/6787] eta 0:14:12 lr 0.000200 time 0.2605 (0.2592) loss 0.3591 (0.3596) grad_norm 209698.4531 (inf) mem 14543MB +[2023-10-13 06:02:10 simmim_pretrain](main_simmim.py 218): INFO Train: [157/200][4000/6787] eta 0:12:02 lr 0.000200 time 0.2605 (0.2593) loss 0.3815 (0.3595) grad_norm 160190.5156 (inf) mem 14543MB +[2023-10-13 06:04:20 simmim_pretrain](main_simmim.py 218): INFO Train: [157/200][4500/6787] eta 0:09:53 lr 0.000200 time 0.2566 (0.2593) loss 0.3420 (0.3594) grad_norm 474693.3438 (inf) mem 14543MB +[2023-10-13 06:06:30 simmim_pretrain](main_simmim.py 218): INFO Train: [157/200][5000/6787] eta 0:07:43 lr 0.000200 time 0.2596 (0.2593) loss 0.3503 (0.3592) grad_norm 365059.6250 (inf) mem 14543MB +[2023-10-13 06:08:38 simmim_pretrain](main_simmim.py 218): INFO Train: [157/200][5500/6787] eta 0:05:33 lr 0.000200 time 0.2577 (0.2591) loss 0.3428 (0.3589) grad_norm 563106.5000 (inf) mem 14543MB +[2023-10-13 06:10:46 
simmim_pretrain](main_simmim.py 218): INFO Train: [157/200][6000/6787] eta 0:03:23 lr 0.000200 time 0.2567 (0.2589) loss 0.3734 (0.3587) grad_norm 256868.7812 (inf) mem 14543MB +[2023-10-13 06:12:54 simmim_pretrain](main_simmim.py 218): INFO Train: [157/200][6500/6787] eta 0:01:14 lr 0.000200 time 0.2545 (0.2586) loss 0.3443 (0.3586) grad_norm 296856.0938 (inf) mem 14543MB +[2023-10-13 06:14:08 simmim_pretrain](main_simmim.py 228): INFO EPOCH 157 training takes 0:29:14 +[2023-10-13 06:14:09 simmim_pretrain](main_simmim.py 218): INFO Train: [158/200][0/6787] eta 2:45:09 lr 0.000200 time 1.4601 (1.4601) loss 0.3510 (0.3510) grad_norm 258717.1719 (258717.1719) mem 14543MB +[2023-10-13 06:16:14 simmim_pretrain](main_simmim.py 218): INFO Train: [158/200][500/6787] eta 0:26:22 lr 0.000200 time 0.2462 (0.2517) loss 0.3545 (0.3582) grad_norm 230107.0312 (233489.0312) mem 14543MB +[2023-10-13 06:18:19 simmim_pretrain](main_simmim.py 218): INFO Train: [158/200][1000/6787] eta 0:24:10 lr 0.000200 time 0.2463 (0.2507) loss 0.3629 (0.3583) grad_norm 276626.1250 (231824.6406) mem 14543MB +[2023-10-13 06:20:24 simmim_pretrain](main_simmim.py 218): INFO Train: [158/200][1500/6787] eta 0:22:04 lr 0.000200 time 0.2483 (0.2505) loss 0.3516 (0.3582) grad_norm 495431.4688 (238631.6094) mem 14543MB +[2023-10-13 06:22:29 simmim_pretrain](main_simmim.py 218): INFO Train: [158/200][2000/6787] eta 0:19:57 lr 0.000200 time 0.2530 (0.2503) loss 0.3694 (0.3580) grad_norm 418535.8125 (263063.8125) mem 14543MB +[2023-10-13 06:24:33 simmim_pretrain](main_simmim.py 218): INFO Train: [158/200][2500/6787] eta 0:17:52 lr 0.000200 time 0.2491 (0.2502) loss 0.3729 (0.3578) grad_norm 301516.2812 (inf) mem 14543MB +[2023-10-13 06:26:38 simmim_pretrain](main_simmim.py 218): INFO Train: [158/200][3000/6787] eta 0:15:47 lr 0.000200 time 0.2482 (0.2501) loss 0.3497 (0.3578) grad_norm 221309.6719 (inf) mem 14543MB +[2023-10-13 06:28:43 simmim_pretrain](main_simmim.py 218): INFO Train: [158/200][3500/6787] eta 
0:13:41 lr 0.000200 time 0.2473 (0.2500) loss 0.3421 (0.3578) grad_norm 144594.2500 (inf) mem 14543MB +[2023-10-13 06:30:48 simmim_pretrain](main_simmim.py 218): INFO Train: [158/200][4000/6787] eta 0:11:36 lr 0.000200 time 0.2587 (0.2499) loss 0.3508 (0.3579) grad_norm 203066.7812 (inf) mem 14543MB +[2023-10-13 06:32:52 simmim_pretrain](main_simmim.py 218): INFO Train: [158/200][4500/6787] eta 0:09:31 lr 0.000200 time 0.2479 (0.2499) loss 0.3603 (0.3580) grad_norm 316892.8750 (inf) mem 14543MB +[2023-10-13 06:34:58 simmim_pretrain](main_simmim.py 218): INFO Train: [158/200][5000/6787] eta 0:07:26 lr 0.000200 time 0.2515 (0.2500) loss 0.3322 (0.3578) grad_norm 346415.8750 (inf) mem 14543MB +[2023-10-13 06:37:04 simmim_pretrain](main_simmim.py 218): INFO Train: [158/200][5500/6787] eta 0:05:22 lr 0.000200 time 0.2562 (0.2502) loss 0.3629 (0.3578) grad_norm 288676.2812 (inf) mem 14543MB +[2023-10-13 06:39:10 simmim_pretrain](main_simmim.py 218): INFO Train: [158/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2476 (0.2504) loss 0.3564 (0.3578) grad_norm 501229.6562 (inf) mem 14543MB +[2023-10-13 06:41:14 simmim_pretrain](main_simmim.py 218): INFO Train: [158/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2494 (0.2502) loss 0.3415 (0.3576) grad_norm 917671.5000 (inf) mem 14543MB +[2023-10-13 06:42:25 simmim_pretrain](main_simmim.py 228): INFO EPOCH 158 training takes 0:28:17 +[2023-10-13 06:42:27 simmim_pretrain](main_simmim.py 218): INFO Train: [159/200][0/6787] eta 2:37:10 lr 0.000200 time 1.3895 (1.3895) loss 0.3457 (0.3457) grad_norm 353975.7188 (353975.7188) mem 14543MB +[2023-10-13 06:44:30 simmim_pretrain](main_simmim.py 218): INFO Train: [159/200][500/6787] eta 0:25:59 lr 0.000200 time 0.2452 (0.2480) loss 0.3496 (0.3549) grad_norm 606030.5625 (inf) mem 14543MB +[2023-10-13 06:46:32 simmim_pretrain](main_simmim.py 218): INFO Train: [159/200][1000/6787] eta 0:23:47 lr 0.000200 time 0.2444 (0.2466) loss 0.3409 (0.3557) grad_norm 476927.6250 (inf) mem 14543MB 
+[2023-10-13 06:48:34 simmim_pretrain](main_simmim.py 218): INFO Train: [159/200][1500/6787] eta 0:21:39 lr 0.000200 time 0.2444 (0.2458) loss 0.3594 (0.3563) grad_norm 431038.6250 (inf) mem 14543MB +[2023-10-13 06:50:36 simmim_pretrain](main_simmim.py 218): INFO Train: [159/200][2000/6787] eta 0:19:34 lr 0.000200 time 0.2436 (0.2453) loss 0.3656 (0.3560) grad_norm 372077.4062 (inf) mem 14543MB +[2023-10-13 06:52:38 simmim_pretrain](main_simmim.py 218): INFO Train: [159/200][2500/6787] eta 0:17:30 lr 0.000200 time 0.2433 (0.2450) loss 0.3620 (0.3561) grad_norm 426754.4062 (inf) mem 14543MB +[2023-10-13 06:54:40 simmim_pretrain](main_simmim.py 218): INFO Train: [159/200][3000/6787] eta 0:15:27 lr 0.000200 time 0.2439 (0.2448) loss 0.3565 (0.3563) grad_norm 265930.3438 (inf) mem 14543MB +[2023-10-13 06:56:42 simmim_pretrain](main_simmim.py 218): INFO Train: [159/200][3500/6787] eta 0:13:24 lr 0.000200 time 0.2435 (0.2447) loss 0.3607 (0.3571) grad_norm 153865.7969 (inf) mem 14543MB +[2023-10-13 06:58:44 simmim_pretrain](main_simmim.py 218): INFO Train: [159/200][4000/6787] eta 0:11:21 lr 0.000200 time 0.2438 (0.2445) loss 0.3483 (0.3578) grad_norm 129962.8906 (inf) mem 14543MB +[2023-10-13 07:00:46 simmim_pretrain](main_simmim.py 218): INFO Train: [159/200][4500/6787] eta 0:09:19 lr 0.000200 time 0.2437 (0.2444) loss 0.3758 (0.3581) grad_norm 124392.2969 (inf) mem 14543MB +[2023-10-13 07:02:48 simmim_pretrain](main_simmim.py 218): INFO Train: [159/200][5000/6787] eta 0:07:16 lr 0.000200 time 0.2437 (0.2444) loss 0.3437 (0.3584) grad_norm 189971.3438 (inf) mem 14543MB +[2023-10-13 07:04:49 simmim_pretrain](main_simmim.py 218): INFO Train: [159/200][5500/6787] eta 0:05:14 lr 0.000200 time 0.2437 (0.2443) loss 0.3691 (0.3585) grad_norm 146737.9531 (inf) mem 14543MB +[2023-10-13 07:06:51 simmim_pretrain](main_simmim.py 218): INFO Train: [159/200][6000/6787] eta 0:03:12 lr 0.000200 time 0.2437 (0.2443) loss 0.3781 (0.3585) grad_norm 172779.5312 (inf) mem 14543MB 
+[2023-10-13 07:08:53 simmim_pretrain](main_simmim.py 218): INFO Train: [159/200][6500/6787] eta 0:01:10 lr 0.000200 time 0.2436 (0.2442) loss 0.3606 (0.3585) grad_norm 181695.5312 (inf) mem 14543MB +[2023-10-13 07:10:04 simmim_pretrain](main_simmim.py 228): INFO EPOCH 159 training takes 0:27:38 +[2023-10-13 07:10:05 simmim_pretrain](main_simmim.py 218): INFO Train: [160/200][0/6787] eta 2:19:12 lr 0.000200 time 1.2306 (1.2306) loss 0.3481 (0.3481) grad_norm 102236.2812 (102236.2812) mem 14543MB +[2023-10-13 07:12:07 simmim_pretrain](main_simmim.py 218): INFO Train: [160/200][500/6787] eta 0:25:45 lr 0.000200 time 0.2439 (0.2458) loss 0.3740 (0.3583) grad_norm 174118.9844 (227118.1719) mem 14543MB +[2023-10-13 07:14:09 simmim_pretrain](main_simmim.py 218): INFO Train: [160/200][1000/6787] eta 0:23:36 lr 0.000200 time 0.2440 (0.2448) loss 0.3265 (0.3574) grad_norm 275769.4688 (253380.7500) mem 14543MB +[2023-10-13 07:16:11 simmim_pretrain](main_simmim.py 218): INFO Train: [160/200][1500/6787] eta 0:21:33 lr 0.000200 time 0.2441 (0.2446) loss 0.3455 (0.3571) grad_norm 333898.8125 (283318.9375) mem 14543MB +[2023-10-13 07:18:13 simmim_pretrain](main_simmim.py 218): INFO Train: [160/200][2000/6787] eta 0:19:30 lr 0.000200 time 0.2441 (0.2445) loss 0.3657 (0.3569) grad_norm 216874.9844 (inf) mem 14543MB +[2023-10-13 07:20:15 simmim_pretrain](main_simmim.py 218): INFO Train: [160/200][2500/6787] eta 0:17:27 lr 0.000200 time 0.2441 (0.2444) loss 0.3663 (0.3571) grad_norm 266075.2188 (inf) mem 14543MB +[2023-10-13 07:22:17 simmim_pretrain](main_simmim.py 218): INFO Train: [160/200][3000/6787] eta 0:15:25 lr 0.000200 time 0.2437 (0.2444) loss 0.3544 (0.3571) grad_norm 323675.9062 (inf) mem 14543MB +[2023-10-13 07:24:19 simmim_pretrain](main_simmim.py 218): INFO Train: [160/200][3500/6787] eta 0:13:23 lr 0.000200 time 0.2443 (0.2444) loss 0.3519 (0.3573) grad_norm 205161.0938 (inf) mem 14543MB +[2023-10-13 07:26:21 simmim_pretrain](main_simmim.py 218): INFO Train: 
[160/200][4000/6787] eta 0:11:20 lr 0.000200 time 0.2437 (0.2443) loss 0.3420 (0.3573) grad_norm 301832.3750 (inf) mem 14543MB +[2023-10-13 07:28:23 simmim_pretrain](main_simmim.py 218): INFO Train: [160/200][4500/6787] eta 0:09:18 lr 0.000200 time 0.2436 (0.2443) loss 0.3832 (0.3572) grad_norm 166255.3906 (inf) mem 14543MB +[2023-10-13 07:30:25 simmim_pretrain](main_simmim.py 218): INFO Train: [160/200][5000/6787] eta 0:07:16 lr 0.000200 time 0.2437 (0.2442) loss 0.3417 (0.3572) grad_norm 303188.2188 (inf) mem 14543MB +[2023-10-13 07:32:27 simmim_pretrain](main_simmim.py 218): INFO Train: [160/200][5500/6787] eta 0:05:14 lr 0.000200 time 0.2436 (0.2442) loss 0.3745 (0.3571) grad_norm 880668.4375 (inf) mem 14543MB +[2023-10-13 07:34:29 simmim_pretrain](main_simmim.py 218): INFO Train: [160/200][6000/6787] eta 0:03:12 lr 0.000200 time 0.2436 (0.2441) loss 0.3636 (0.3570) grad_norm 612333.3750 (inf) mem 14543MB +[2023-10-13 07:36:31 simmim_pretrain](main_simmim.py 218): INFO Train: [160/200][6500/6787] eta 0:01:10 lr 0.000200 time 0.2433 (0.2441) loss 0.3579 (0.3569) grad_norm 339250.9688 (inf) mem 14543MB +[2023-10-13 07:37:41 simmim_pretrain](main_simmim.py 228): INFO EPOCH 160 training takes 0:27:37 +[2023-10-13 07:37:41 simmim_pretrain](utils.py 62): INFO /root/autodl-tmp/LSQ-simmim/checkpoint/simmim_pretrain/vit_simmim/ckpt_epoch_160.pth saving...... +[2023-10-13 07:37:42 simmim_pretrain](utils.py 64): INFO /root/autodl-tmp/LSQ-simmim/checkpoint/simmim_pretrain/vit_simmim/ckpt_epoch_160.pth saved !!! 
+[2023-10-13 07:37:43 simmim_pretrain](main_simmim.py 218): INFO Train: [161/200][0/6787] eta 2:36:56 lr 0.000200 time 1.3875 (1.3875) loss 0.3617 (0.3617) grad_norm 220175.4844 (220175.4844) mem 14543MB +[2023-10-13 07:39:45 simmim_pretrain](main_simmim.py 218): INFO Train: [161/200][500/6787] eta 0:25:46 lr 0.000200 time 0.2440 (0.2460) loss 0.3413 (0.3574) grad_norm 341399.0938 (259755.2344) mem 14543MB +[2023-10-13 07:41:47 simmim_pretrain](main_simmim.py 218): INFO Train: [161/200][1000/6787] eta 0:23:36 lr 0.000200 time 0.2438 (0.2449) loss 0.3743 (0.3578) grad_norm 247712.6250 (251756.7656) mem 14543MB +[2023-10-13 07:43:49 simmim_pretrain](main_simmim.py 218): INFO Train: [161/200][1500/6787] eta 0:21:33 lr 0.000200 time 0.2445 (0.2446) loss 0.3647 (0.3580) grad_norm 280618.3750 (250955.6250) mem 14543MB +[2023-10-13 07:45:51 simmim_pretrain](main_simmim.py 218): INFO Train: [161/200][2000/6787] eta 0:19:30 lr 0.000200 time 0.2439 (0.2444) loss 0.3601 (0.3578) grad_norm 215340.0938 (262470.1250) mem 14543MB +[2023-10-13 07:47:53 simmim_pretrain](main_simmim.py 218): INFO Train: [161/200][2500/6787] eta 0:17:27 lr 0.000200 time 0.2444 (0.2444) loss 0.3719 (0.3577) grad_norm 214835.4219 (279945.9062) mem 14543MB +[2023-10-13 07:49:56 simmim_pretrain](main_simmim.py 218): INFO Train: [161/200][3000/6787] eta 0:15:25 lr 0.000200 time 0.2454 (0.2445) loss 0.3598 (0.3575) grad_norm 385935.8438 (304813.8750) mem 14543MB +[2023-10-13 07:51:58 simmim_pretrain](main_simmim.py 218): INFO Train: [161/200][3500/6787] eta 0:13:24 lr 0.000200 time 0.2454 (0.2447) loss 0.3507 (0.3574) grad_norm 460918.9062 (330253.8438) mem 14543MB +[2023-10-13 07:54:01 simmim_pretrain](main_simmim.py 218): INFO Train: [161/200][4000/6787] eta 0:11:22 lr 0.000200 time 0.2471 (0.2448) loss 0.3458 (0.3572) grad_norm 278269.8125 (inf) mem 14543MB +[2023-10-13 07:56:04 simmim_pretrain](main_simmim.py 218): INFO Train: [161/200][4500/6787] eta 0:09:20 lr 0.000200 time 0.2453 (0.2449) loss 
0.3476 (0.3570) grad_norm 557877.1250 (inf) mem 14543MB +[2023-10-13 07:58:07 simmim_pretrain](main_simmim.py 218): INFO Train: [161/200][5000/6787] eta 0:07:17 lr 0.000200 time 0.2454 (0.2450) loss 0.3498 (0.3571) grad_norm 487547.6562 (inf) mem 14543MB +[2023-10-13 08:00:10 simmim_pretrain](main_simmim.py 218): INFO Train: [161/200][5500/6787] eta 0:05:15 lr 0.000200 time 0.2463 (0.2451) loss 0.3349 (0.3571) grad_norm 355191.7812 (inf) mem 14543MB +[2023-10-13 08:02:12 simmim_pretrain](main_simmim.py 218): INFO Train: [161/200][6000/6787] eta 0:03:12 lr 0.000200 time 0.2481 (0.2450) loss 0.3665 (0.3572) grad_norm 214859.5469 (inf) mem 14543MB +[2023-10-13 08:04:15 simmim_pretrain](main_simmim.py 218): INFO Train: [161/200][6500/6787] eta 0:01:10 lr 0.000200 time 0.2453 (0.2450) loss 0.3542 (0.3575) grad_norm 327978.1875 (inf) mem 14543MB +[2023-10-13 08:05:25 simmim_pretrain](main_simmim.py 228): INFO EPOCH 161 training takes 0:27:43 +[2023-10-13 08:05:27 simmim_pretrain](main_simmim.py 218): INFO Train: [162/200][0/6787] eta 2:28:05 lr 0.000200 time 1.3092 (1.3092) loss 0.3798 (0.3798) grad_norm 212816.1875 (212816.1875) mem 14543MB +[2023-10-13 08:07:29 simmim_pretrain](main_simmim.py 218): INFO Train: [162/200][500/6787] eta 0:25:53 lr 0.000200 time 0.2490 (0.2470) loss 0.3524 (0.3594) grad_norm 208015.3438 (255356.5000) mem 14543MB +[2023-10-13 08:09:32 simmim_pretrain](main_simmim.py 218): INFO Train: [162/200][1000/6787] eta 0:23:46 lr 0.000200 time 0.2445 (0.2464) loss 0.3522 (0.3579) grad_norm 313810.5000 (293862.6875) mem 14543MB +[2023-10-13 08:11:35 simmim_pretrain](main_simmim.py 218): INFO Train: [162/200][1500/6787] eta 0:21:41 lr 0.000200 time 0.2488 (0.2462) loss 0.3710 (0.3575) grad_norm 428011.2188 (323607.4062) mem 14543MB +[2023-10-13 08:13:38 simmim_pretrain](main_simmim.py 218): INFO Train: [162/200][2000/6787] eta 0:19:38 lr 0.000200 time 0.2456 (0.2461) loss 0.3625 (0.3572) grad_norm 470220.7812 (359089.8438) mem 14543MB +[2023-10-13 
08:15:41 simmim_pretrain](main_simmim.py 218): INFO Train: [162/200][2500/6787] eta 0:17:35 lr 0.000200 time 0.2440 (0.2461) loss 0.3270 (0.3570) grad_norm 681758.8750 (inf) mem 14543MB +[2023-10-13 08:17:44 simmim_pretrain](main_simmim.py 218): INFO Train: [162/200][3000/6787] eta 0:15:31 lr 0.000200 time 0.2443 (0.2461) loss 0.3553 (0.3568) grad_norm 506910.2812 (inf) mem 14543MB +[2023-10-13 08:19:47 simmim_pretrain](main_simmim.py 218): INFO Train: [162/200][3500/6787] eta 0:13:28 lr 0.000200 time 0.2445 (0.2460) loss 0.3528 (0.3570) grad_norm 448595.9688 (inf) mem 14543MB +[2023-10-13 08:21:50 simmim_pretrain](main_simmim.py 218): INFO Train: [162/200][4000/6787] eta 0:11:25 lr 0.000200 time 0.2438 (0.2460) loss 0.3693 (0.3578) grad_norm 127302.8203 (inf) mem 14543MB +[2023-10-13 08:23:52 simmim_pretrain](main_simmim.py 218): INFO Train: [162/200][4500/6787] eta 0:09:22 lr 0.000200 time 0.2490 (0.2460) loss 0.3700 (0.3582) grad_norm 140054.6406 (inf) mem 14543MB +[2023-10-13 08:25:55 simmim_pretrain](main_simmim.py 218): INFO Train: [162/200][5000/6787] eta 0:07:19 lr 0.000200 time 0.2444 (0.2459) loss 0.3632 (0.3584) grad_norm 168065.4844 (inf) mem 14543MB +[2023-10-13 08:27:58 simmim_pretrain](main_simmim.py 218): INFO Train: [162/200][5500/6787] eta 0:05:16 lr 0.000200 time 0.2444 (0.2459) loss 0.3488 (0.3586) grad_norm 139496.7188 (inf) mem 14543MB +[2023-10-13 08:30:01 simmim_pretrain](main_simmim.py 218): INFO Train: [162/200][6000/6787] eta 0:03:13 lr 0.000200 time 0.2442 (0.2459) loss 0.3654 (0.3587) grad_norm 191229.7031 (inf) mem 14543MB +[2023-10-13 08:32:04 simmim_pretrain](main_simmim.py 218): INFO Train: [162/200][6500/6787] eta 0:01:10 lr 0.000200 time 0.2439 (0.2459) loss 0.3501 (0.3587) grad_norm 155272.0156 (inf) mem 14543MB +[2023-10-13 08:33:15 simmim_pretrain](main_simmim.py 228): INFO EPOCH 162 training takes 0:27:49 +[2023-10-13 08:33:16 simmim_pretrain](main_simmim.py 218): INFO Train: [163/200][0/6787] eta 2:42:09 lr 0.000200 time 
1.4336 (1.4336) loss 0.3827 (0.3827) grad_norm 158166.1562 (158166.1562) mem 14543MB +[2023-10-13 08:35:19 simmim_pretrain](main_simmim.py 218): INFO Train: [163/200][500/6787] eta 0:25:59 lr 0.000200 time 0.2452 (0.2481) loss 0.3446 (0.3583) grad_norm 155169.2969 (226022.0625) mem 14543MB +[2023-10-13 08:37:22 simmim_pretrain](main_simmim.py 218): INFO Train: [163/200][1000/6787] eta 0:23:48 lr 0.000200 time 0.2446 (0.2469) loss 0.3435 (0.3578) grad_norm 180997.2500 (259132.4688) mem 14543MB +[2023-10-13 08:39:25 simmim_pretrain](main_simmim.py 218): INFO Train: [163/200][1500/6787] eta 0:21:43 lr 0.000200 time 0.2448 (0.2465) loss 0.3648 (0.3572) grad_norm 387402.8750 (284876.9062) mem 14543MB +[2023-10-13 08:41:28 simmim_pretrain](main_simmim.py 218): INFO Train: [163/200][2000/6787] eta 0:19:39 lr 0.000200 time 0.2443 (0.2463) loss 0.3480 (0.3572) grad_norm 430725.9375 (307281.5625) mem 14543MB +[2023-10-13 08:43:31 simmim_pretrain](main_simmim.py 218): INFO Train: [163/200][2500/6787] eta 0:17:35 lr 0.000200 time 0.2443 (0.2462) loss 0.3461 (0.3572) grad_norm 549763.7500 (336486.4688) mem 14543MB +[2023-10-13 08:45:34 simmim_pretrain](main_simmim.py 218): INFO Train: [163/200][3000/6787] eta 0:15:32 lr 0.000200 time 0.2446 (0.2462) loss 0.3377 (0.3571) grad_norm 535634.6250 (inf) mem 14543MB +[2023-10-13 08:47:37 simmim_pretrain](main_simmim.py 218): INFO Train: [163/200][3500/6787] eta 0:13:28 lr 0.000200 time 0.2485 (0.2461) loss 0.3735 (0.3571) grad_norm 832909.3750 (inf) mem 14543MB +[2023-10-13 08:49:39 simmim_pretrain](main_simmim.py 218): INFO Train: [163/200][4000/6787] eta 0:11:25 lr 0.000200 time 0.2481 (0.2461) loss 0.3497 (0.3570) grad_norm 372282.9688 (inf) mem 14543MB +[2023-10-13 08:51:42 simmim_pretrain](main_simmim.py 218): INFO Train: [163/200][4500/6787] eta 0:09:22 lr 0.000200 time 0.2451 (0.2460) loss 0.3623 (0.3573) grad_norm 228408.2500 (inf) mem 14543MB +[2023-10-13 08:53:45 simmim_pretrain](main_simmim.py 218): INFO Train: 
[163/200][5000/6787] eta 0:07:19 lr 0.000200 time 0.2451 (0.2460) loss 0.3526 (0.3574) grad_norm 281228.7500 (inf) mem 14543MB +[2023-10-13 08:55:48 simmim_pretrain](main_simmim.py 218): INFO Train: [163/200][5500/6787] eta 0:05:16 lr 0.000200 time 0.2440 (0.2460) loss 0.3682 (0.3575) grad_norm 316905.0938 (inf) mem 14543MB +[2023-10-13 08:57:51 simmim_pretrain](main_simmim.py 218): INFO Train: [163/200][6000/6787] eta 0:03:13 lr 0.000200 time 0.2441 (0.2460) loss 0.3656 (0.3576) grad_norm 446959.0000 (inf) mem 14543MB +[2023-10-13 08:59:54 simmim_pretrain](main_simmim.py 218): INFO Train: [163/200][6500/6787] eta 0:01:10 lr 0.000200 time 0.2480 (0.2460) loss 0.3422 (0.3575) grad_norm 179075.6406 (inf) mem 14543MB +[2023-10-13 09:01:05 simmim_pretrain](main_simmim.py 228): INFO EPOCH 163 training takes 0:27:50 +[2023-10-13 09:01:06 simmim_pretrain](main_simmim.py 218): INFO Train: [164/200][0/6787] eta 2:24:26 lr 0.000200 time 1.2769 (1.2769) loss 0.3618 (0.3618) grad_norm 168501.8438 (168501.8438) mem 14543MB +[2023-10-13 09:03:09 simmim_pretrain](main_simmim.py 218): INFO Train: [164/200][500/6787] eta 0:25:58 lr 0.000200 time 0.2443 (0.2478) loss 0.3615 (0.3605) grad_norm 85595.3984 (150176.7031) mem 14543MB +[2023-10-13 09:05:12 simmim_pretrain](main_simmim.py 218): INFO Train: [164/200][1000/6787] eta 0:23:48 lr 0.000200 time 0.2439 (0.2468) loss 0.3568 (0.3603) grad_norm 138852.4062 (146163.7188) mem 14543MB +[2023-10-13 09:07:15 simmim_pretrain](main_simmim.py 218): INFO Train: [164/200][1500/6787] eta 0:21:42 lr 0.000200 time 0.2446 (0.2464) loss 0.3348 (0.3608) grad_norm 223223.5469 (140518.1719) mem 14543MB +[2023-10-13 09:09:18 simmim_pretrain](main_simmim.py 218): INFO Train: [164/200][2000/6787] eta 0:19:39 lr 0.000200 time 0.2453 (0.2463) loss 0.3638 (0.3606) grad_norm 137159.5000 (142774.9375) mem 14543MB +[2023-10-13 09:11:21 simmim_pretrain](main_simmim.py 218): INFO Train: [164/200][2500/6787] eta 0:17:35 lr 0.000200 time 0.2453 (0.2462) loss 
0.3763 (0.3601) grad_norm 219539.3906 (153732.7031) mem 14543MB +[2023-10-13 09:13:24 simmim_pretrain](main_simmim.py 218): INFO Train: [164/200][3000/6787] eta 0:15:32 lr 0.000200 time 0.2444 (0.2462) loss 0.3500 (0.3596) grad_norm 191823.7188 (162009.1094) mem 14543MB +[2023-10-13 09:15:27 simmim_pretrain](main_simmim.py 218): INFO Train: [164/200][3500/6787] eta 0:13:28 lr 0.000200 time 0.2457 (0.2461) loss 0.3486 (0.3593) grad_norm 303283.9375 (169603.4375) mem 14543MB +[2023-10-13 09:17:30 simmim_pretrain](main_simmim.py 218): INFO Train: [164/200][4000/6787] eta 0:11:25 lr 0.000200 time 0.2446 (0.2461) loss 0.3649 (0.3591) grad_norm 538078.5625 (182905.6562) mem 14543MB +[2023-10-13 09:19:32 simmim_pretrain](main_simmim.py 218): INFO Train: [164/200][4500/6787] eta 0:09:22 lr 0.000200 time 0.2481 (0.2460) loss 0.3488 (0.3588) grad_norm 158370.2344 (204587.8906) mem 14543MB +[2023-10-13 09:21:35 simmim_pretrain](main_simmim.py 218): INFO Train: [164/200][5000/6787] eta 0:07:19 lr 0.000200 time 0.2451 (0.2460) loss 0.3587 (0.3586) grad_norm 334846.9375 (220143.5938) mem 14543MB +[2023-10-13 09:23:38 simmim_pretrain](main_simmim.py 218): INFO Train: [164/200][5500/6787] eta 0:05:16 lr 0.000200 time 0.2449 (0.2460) loss 0.3696 (0.3584) grad_norm 135668.7188 (inf) mem 14543MB +[2023-10-13 09:25:41 simmim_pretrain](main_simmim.py 218): INFO Train: [164/200][6000/6787] eta 0:03:13 lr 0.000200 time 0.2451 (0.2460) loss 0.3462 (0.3584) grad_norm 250772.6406 (inf) mem 14543MB +[2023-10-13 09:27:44 simmim_pretrain](main_simmim.py 218): INFO Train: [164/200][6500/6787] eta 0:01:10 lr 0.000200 time 0.2454 (0.2460) loss 0.3515 (0.3584) grad_norm 218418.6875 (inf) mem 14543MB +[2023-10-13 09:28:55 simmim_pretrain](main_simmim.py 228): INFO EPOCH 164 training takes 0:27:50 +[2023-10-13 09:28:56 simmim_pretrain](main_simmim.py 218): INFO Train: [165/200][0/6787] eta 2:23:51 lr 0.000200 time 1.2717 (1.2717) loss 0.3586 (0.3586) grad_norm 270000.0938 (270000.0938) mem 14543MB 
+[2023-10-13 09:30:59 simmim_pretrain](main_simmim.py 218): INFO Train: [165/200][500/6787] eta 0:25:57 lr 0.000200 time 0.2457 (0.2478) loss 0.3777 (0.3568) grad_norm 357105.6562 (259645.5938) mem 14543MB +[2023-10-13 09:33:02 simmim_pretrain](main_simmim.py 218): INFO Train: [165/200][1000/6787] eta 0:23:48 lr 0.000200 time 0.2455 (0.2468) loss 0.3699 (0.3569) grad_norm 258978.7344 (320131.6562) mem 14543MB +[2023-10-13 09:35:05 simmim_pretrain](main_simmim.py 218): INFO Train: [165/200][1500/6787] eta 0:21:42 lr 0.000200 time 0.2454 (0.2464) loss 0.3689 (0.3567) grad_norm 365289.1875 (342477.8125) mem 14543MB +[2023-10-13 09:37:08 simmim_pretrain](main_simmim.py 218): INFO Train: [165/200][2000/6787] eta 0:19:38 lr 0.000200 time 0.2452 (0.2463) loss 0.3476 (0.3570) grad_norm 371304.2500 (inf) mem 14543MB +[2023-10-13 09:39:11 simmim_pretrain](main_simmim.py 218): INFO Train: [165/200][2500/6787] eta 0:17:35 lr 0.000200 time 0.2460 (0.2462) loss 0.3434 (0.3572) grad_norm 179247.8281 (inf) mem 14543MB +[2023-10-13 09:41:14 simmim_pretrain](main_simmim.py 218): INFO Train: [165/200][3000/6787] eta 0:15:31 lr 0.000200 time 0.2452 (0.2461) loss 0.3721 (0.3573) grad_norm 224285.6562 (inf) mem 14543MB +[2023-10-13 09:43:17 simmim_pretrain](main_simmim.py 218): INFO Train: [165/200][3500/6787] eta 0:13:28 lr 0.000200 time 0.2452 (0.2460) loss 0.3483 (0.3575) grad_norm 289217.6562 (inf) mem 14543MB +[2023-10-13 09:45:20 simmim_pretrain](main_simmim.py 218): INFO Train: [165/200][4000/6787] eta 0:11:25 lr 0.000200 time 0.2459 (0.2460) loss 0.3396 (0.3575) grad_norm 422155.1562 (inf) mem 14543MB +[2023-10-13 09:47:24 simmim_pretrain](main_simmim.py 218): INFO Train: [165/200][4500/6787] eta 0:09:23 lr 0.000200 time 0.2495 (0.2463) loss 0.3639 (0.3574) grad_norm 325241.3750 (inf) mem 14543MB +[2023-10-13 09:49:30 simmim_pretrain](main_simmim.py 218): INFO Train: [165/200][5000/6787] eta 0:07:21 lr 0.000200 time 0.2516 (0.2469) loss 0.3333 (0.3572) grad_norm 321164.3750 
(inf) mem 14543MB +[2023-10-13 09:51:37 simmim_pretrain](main_simmim.py 218): INFO Train: [165/200][5500/6787] eta 0:05:18 lr 0.000200 time 0.2465 (0.2475) loss 0.3337 (0.3572) grad_norm 280656.5938 (inf) mem 14543MB +[2023-10-13 09:53:43 simmim_pretrain](main_simmim.py 218): INFO Train: [165/200][6000/6787] eta 0:03:15 lr 0.000200 time 0.2550 (0.2480) loss 0.3470 (0.3573) grad_norm 328595.9375 (inf) mem 14543MB +[2023-10-13 09:55:53 simmim_pretrain](main_simmim.py 218): INFO Train: [165/200][6500/6787] eta 0:01:11 lr 0.000200 time 0.2611 (0.2489) loss 0.3561 (0.3575) grad_norm 171798.4375 (inf) mem 14543MB +[2023-10-13 09:57:08 simmim_pretrain](main_simmim.py 228): INFO EPOCH 165 training takes 0:28:13 +[2023-10-13 09:57:10 simmim_pretrain](main_simmim.py 218): INFO Train: [166/200][0/6787] eta 2:28:02 lr 0.000200 time 1.3088 (1.3088) loss 0.3554 (0.3554) grad_norm 133739.7344 (133739.7344) mem 14543MB +[2023-10-13 09:59:14 simmim_pretrain](main_simmim.py 218): INFO Train: [166/200][500/6787] eta 0:26:13 lr 0.000200 time 0.2464 (0.2503) loss 0.3597 (0.3599) grad_norm 176402.4844 (139173.7656) mem 14543MB +[2023-10-13 10:01:18 simmim_pretrain](main_simmim.py 218): INFO Train: [166/200][1000/6787] eta 0:24:01 lr 0.000200 time 0.2460 (0.2490) loss 0.3671 (0.3600) grad_norm 199878.1406 (134413.2500) mem 14543MB +[2023-10-13 10:03:21 simmim_pretrain](main_simmim.py 218): INFO Train: [166/200][1500/6787] eta 0:21:53 lr 0.000200 time 0.2458 (0.2484) loss 0.3691 (0.3601) grad_norm 141788.3750 (133173.8125) mem 14543MB +[2023-10-13 10:05:25 simmim_pretrain](main_simmim.py 218): INFO Train: [166/200][2000/6787] eta 0:19:47 lr 0.000200 time 0.2466 (0.2481) loss 0.3370 (0.3597) grad_norm 186883.1562 (144039.6250) mem 14543MB +[2023-10-13 10:07:29 simmim_pretrain](main_simmim.py 218): INFO Train: [166/200][2500/6787] eta 0:17:44 lr 0.000200 time 0.2453 (0.2483) loss 0.3509 (0.3595) grad_norm 131602.8906 (154643.1094) mem 14543MB +[2023-10-13 10:09:34 
simmim_pretrain](main_simmim.py 218): INFO Train: [166/200][3000/6787] eta 0:15:41 lr 0.000200 time 0.2504 (0.2485) loss 0.3678 (0.3593) grad_norm 295628.7500 (164803.7656) mem 14543MB +[2023-10-13 10:11:40 simmim_pretrain](main_simmim.py 218): INFO Train: [166/200][3500/6787] eta 0:13:38 lr 0.000200 time 0.2484 (0.2491) loss 0.3935 (0.3592) grad_norm 166350.6875 (173726.8750) mem 14543MB +[2023-10-13 10:13:48 simmim_pretrain](main_simmim.py 218): INFO Train: [166/200][4000/6787] eta 0:11:36 lr 0.000200 time 0.2489 (0.2498) loss 0.3443 (0.3589) grad_norm 337484.4375 (205526.6562) mem 14543MB +[2023-10-13 10:15:55 simmim_pretrain](main_simmim.py 218): INFO Train: [166/200][4500/6787] eta 0:09:32 lr 0.000200 time 0.2575 (0.2504) loss 0.3418 (0.3587) grad_norm 360448.0000 (219656.7344) mem 14543MB +[2023-10-13 10:18:04 simmim_pretrain](main_simmim.py 218): INFO Train: [166/200][5000/6787] eta 0:07:28 lr 0.000200 time 0.2604 (0.2511) loss 0.3646 (0.3583) grad_norm 472809.2812 (233837.3125) mem 14543MB +[2023-10-13 10:20:12 simmim_pretrain](main_simmim.py 218): INFO Train: [166/200][5500/6787] eta 0:05:23 lr 0.000200 time 0.2513 (0.2516) loss 0.3422 (0.3582) grad_norm 334329.8750 (250818.3594) mem 14543MB +[2023-10-13 10:22:20 simmim_pretrain](main_simmim.py 218): INFO Train: [166/200][6000/6787] eta 0:03:18 lr 0.000200 time 0.2576 (0.2519) loss 0.3632 (0.3580) grad_norm 202576.9531 (inf) mem 14543MB +[2023-10-13 10:24:28 simmim_pretrain](main_simmim.py 218): INFO Train: [166/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2601 (0.2522) loss 0.3587 (0.3579) grad_norm 242910.6562 (inf) mem 14543MB +[2023-10-13 10:25:43 simmim_pretrain](main_simmim.py 228): INFO EPOCH 166 training takes 0:28:34 +[2023-10-13 10:25:44 simmim_pretrain](main_simmim.py 218): INFO Train: [167/200][0/6787] eta 2:30:36 lr 0.000200 time 1.3314 (1.3314) loss 0.3587 (0.3587) grad_norm 221753.3906 (221753.3906) mem 14543MB +[2023-10-13 10:27:51 simmim_pretrain](main_simmim.py 218): INFO Train: 
[167/200][500/6787] eta 0:26:45 lr 0.000200 time 0.2515 (0.2554) loss 0.3706 (0.3585) grad_norm 220825.6875 (270814.2500) mem 14543MB +[2023-10-13 10:29:57 simmim_pretrain](main_simmim.py 218): INFO Train: [167/200][1000/6787] eta 0:24:30 lr 0.000200 time 0.2466 (0.2540) loss 0.3554 (0.3589) grad_norm 313616.3438 (260036.7500) mem 14543MB +[2023-10-13 10:32:03 simmim_pretrain](main_simmim.py 218): INFO Train: [167/200][1500/6787] eta 0:22:19 lr 0.000200 time 0.2469 (0.2534) loss 0.3893 (0.3585) grad_norm 417989.4062 (255936.3438) mem 14543MB +[2023-10-13 10:34:09 simmim_pretrain](main_simmim.py 218): INFO Train: [167/200][2000/6787] eta 0:20:10 lr 0.000200 time 0.2518 (0.2528) loss 0.3556 (0.3585) grad_norm 197368.0156 (273454.9375) mem 14543MB +[2023-10-13 10:36:14 simmim_pretrain](main_simmim.py 218): INFO Train: [167/200][2500/6787] eta 0:18:02 lr 0.000200 time 0.2521 (0.2524) loss 0.3795 (0.3580) grad_norm 474783.0000 (298285.5312) mem 14543MB +[2023-10-13 10:38:19 simmim_pretrain](main_simmim.py 218): INFO Train: [167/200][3000/6787] eta 0:15:54 lr 0.000200 time 0.2466 (0.2520) loss 0.3419 (0.3576) grad_norm 341850.7188 (329990.2500) mem 14543MB +[2023-10-13 10:40:24 simmim_pretrain](main_simmim.py 218): INFO Train: [167/200][3500/6787] eta 0:13:47 lr 0.000200 time 0.2449 (0.2517) loss 0.3692 (0.3574) grad_norm 318762.3438 (inf) mem 14543MB +[2023-10-13 10:42:29 simmim_pretrain](main_simmim.py 218): INFO Train: [167/200][4000/6787] eta 0:11:40 lr 0.000200 time 0.2479 (0.2514) loss 0.3701 (0.3575) grad_norm 402413.1875 (inf) mem 14543MB +[2023-10-13 10:44:34 simmim_pretrain](main_simmim.py 218): INFO Train: [167/200][4500/6787] eta 0:09:34 lr 0.000200 time 0.2461 (0.2512) loss 0.3485 (0.3577) grad_norm 216998.4688 (inf) mem 14543MB +[2023-10-13 10:46:39 simmim_pretrain](main_simmim.py 218): INFO Train: [167/200][5000/6787] eta 0:07:28 lr 0.000200 time 0.2535 (0.2511) loss 0.3470 (0.3578) grad_norm 156082.0312 (inf) mem 14543MB +[2023-10-13 10:48:44 
simmim_pretrain](main_simmim.py 218): INFO Train: [167/200][5500/6787] eta 0:05:23 lr 0.000200 time 0.2458 (0.2510) loss 0.3458 (0.3579) grad_norm 79119.3750 (inf) mem 14543MB +[2023-10-13 10:50:49 simmim_pretrain](main_simmim.py 218): INFO Train: [167/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2574 (0.2510) loss 0.3592 (0.3582) grad_norm 79177.7578 (inf) mem 14543MB +[2023-10-13 10:52:55 simmim_pretrain](main_simmim.py 218): INFO Train: [167/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2499 (0.2510) loss 0.3667 (0.3585) grad_norm 122780.8281 (inf) mem 14543MB +[2023-10-13 10:54:07 simmim_pretrain](main_simmim.py 228): INFO EPOCH 167 training takes 0:28:24 +[2023-10-13 10:54:09 simmim_pretrain](main_simmim.py 218): INFO Train: [168/200][0/6787] eta 3:00:22 lr 0.000200 time 1.5945 (1.5945) loss 0.3890 (0.3890) grad_norm 61192.7109 (61192.7109) mem 14543MB +[2023-10-13 10:56:14 simmim_pretrain](main_simmim.py 218): INFO Train: [168/200][500/6787] eta 0:26:35 lr 0.000200 time 0.2534 (0.2537) loss 0.3589 (0.3604) grad_norm 159491.5156 (131847.3438) mem 14543MB +[2023-10-13 10:58:20 simmim_pretrain](main_simmim.py 218): INFO Train: [168/200][1000/6787] eta 0:24:21 lr 0.000200 time 0.2546 (0.2526) loss 0.3582 (0.3597) grad_norm 208774.1562 (154517.2344) mem 14543MB +[2023-10-13 11:00:26 simmim_pretrain](main_simmim.py 218): INFO Train: [168/200][1500/6787] eta 0:22:14 lr 0.000200 time 0.2477 (0.2524) loss 0.3559 (0.3592) grad_norm 233936.6875 (168350.3438) mem 14543MB +[2023-10-13 11:02:32 simmim_pretrain](main_simmim.py 218): INFO Train: [168/200][2000/6787] eta 0:20:07 lr 0.000200 time 0.2464 (0.2523) loss 0.3591 (0.3587) grad_norm 137638.8750 (178147.2031) mem 14543MB +[2023-10-13 11:04:38 simmim_pretrain](main_simmim.py 218): INFO Train: [168/200][2500/6787] eta 0:18:01 lr 0.000200 time 0.2505 (0.2523) loss 0.3473 (0.3586) grad_norm 185826.8594 (185974.0781) mem 14543MB +[2023-10-13 11:06:44 simmim_pretrain](main_simmim.py 218): INFO Train: [168/200][3000/6787] 
eta 0:15:55 lr 0.000200 time 0.2523 (0.2523) loss 0.3572 (0.3583) grad_norm 357885.2188 (210770.6250) mem 14543MB +[2023-10-13 11:08:51 simmim_pretrain](main_simmim.py 218): INFO Train: [168/200][3500/6787] eta 0:13:49 lr 0.000200 time 0.2558 (0.2523) loss 0.3564 (0.3580) grad_norm 466469.1562 (232267.9844) mem 14543MB +[2023-10-13 11:10:57 simmim_pretrain](main_simmim.py 218): INFO Train: [168/200][4000/6787] eta 0:11:43 lr 0.000200 time 0.2459 (0.2523) loss 0.3366 (0.3579) grad_norm 369993.4062 (257380.0469) mem 14543MB +[2023-10-13 11:13:03 simmim_pretrain](main_simmim.py 218): INFO Train: [168/200][4500/6787] eta 0:09:36 lr 0.000200 time 0.2585 (0.2523) loss 0.3393 (0.3578) grad_norm 335482.3125 (274836.0938) mem 14543MB +[2023-10-13 11:15:08 simmim_pretrain](main_simmim.py 218): INFO Train: [168/200][5000/6787] eta 0:07:30 lr 0.000200 time 0.2506 (0.2522) loss 0.3469 (0.3576) grad_norm 1160416.5000 (301340.6562) mem 14543MB +[2023-10-13 11:17:14 simmim_pretrain](main_simmim.py 218): INFO Train: [168/200][5500/6787] eta 0:05:24 lr 0.000200 time 0.2462 (0.2521) loss 0.3541 (0.3575) grad_norm 357477.4375 (inf) mem 14543MB +[2023-10-13 11:19:20 simmim_pretrain](main_simmim.py 218): INFO Train: [168/200][6000/6787] eta 0:03:18 lr 0.000200 time 0.2526 (0.2520) loss 0.3571 (0.3576) grad_norm 140712.5938 (inf) mem 14543MB +[2023-10-13 11:21:25 simmim_pretrain](main_simmim.py 218): INFO Train: [168/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2472 (0.2520) loss 0.3422 (0.3576) grad_norm 302327.0000 (inf) mem 14543MB +[2023-10-13 11:22:38 simmim_pretrain](main_simmim.py 228): INFO EPOCH 168 training takes 0:28:30 +[2023-10-13 11:22:39 simmim_pretrain](main_simmim.py 218): INFO Train: [169/200][0/6787] eta 3:01:30 lr 0.000200 time 1.6046 (1.6046) loss 0.3356 (0.3356) grad_norm 222485.7188 (222485.7188) mem 14543MB +[2023-10-13 11:24:44 simmim_pretrain](main_simmim.py 218): INFO Train: [169/200][500/6787] eta 0:26:30 lr 0.000200 time 0.2511 (0.2530) loss 0.3594 (0.3578) 
grad_norm 366837.8750 (259586.3281) mem 14543MB +[2023-10-13 11:26:50 simmim_pretrain](main_simmim.py 218): INFO Train: [169/200][1000/6787] eta 0:24:16 lr 0.000200 time 0.2533 (0.2517) loss 0.3321 (0.3574) grad_norm 290214.8125 (299226.5000) mem 14543MB +[2023-10-13 11:28:55 simmim_pretrain](main_simmim.py 218): INFO Train: [169/200][1500/6787] eta 0:22:08 lr 0.000200 time 0.2488 (0.2513) loss 0.3635 (0.3570) grad_norm 475944.4375 (330011.9062) mem 14543MB +[2023-10-13 11:31:00 simmim_pretrain](main_simmim.py 218): INFO Train: [169/200][2000/6787] eta 0:20:02 lr 0.000200 time 0.2598 (0.2513) loss 0.3659 (0.3568) grad_norm 316713.4375 (inf) mem 14543MB +[2023-10-13 11:33:06 simmim_pretrain](main_simmim.py 218): INFO Train: [169/200][2500/6787] eta 0:17:57 lr 0.000200 time 0.2455 (0.2513) loss 0.3620 (0.3569) grad_norm 311703.8125 (inf) mem 14543MB +[2023-10-13 11:35:12 simmim_pretrain](main_simmim.py 218): INFO Train: [169/200][3000/6787] eta 0:15:51 lr 0.000200 time 0.2528 (0.2513) loss 0.3282 (0.3571) grad_norm 450430.1875 (inf) mem 14543MB +[2023-10-13 11:37:18 simmim_pretrain](main_simmim.py 218): INFO Train: [169/200][3500/6787] eta 0:13:46 lr 0.000200 time 0.2470 (0.2515) loss 0.5572 (0.3589) grad_norm 40141.7773 (inf) mem 14543MB +[2023-10-13 11:39:24 simmim_pretrain](main_simmim.py 218): INFO Train: [169/200][4000/6787] eta 0:11:41 lr 0.000200 time 0.2529 (0.2516) loss 0.3605 (0.3681) grad_norm 55909.1328 (inf) mem 14543MB +[2023-10-13 11:41:30 simmim_pretrain](main_simmim.py 218): INFO Train: [169/200][4500/6787] eta 0:09:35 lr 0.000200 time 0.2539 (0.2517) loss 0.3421 (0.3682) grad_norm 44270.9531 (inf) mem 14543MB +[2023-10-13 11:43:37 simmim_pretrain](main_simmim.py 218): INFO Train: [169/200][5000/6787] eta 0:07:29 lr 0.000200 time 0.2483 (0.2518) loss 0.3629 (0.3681) grad_norm 21967.7871 (inf) mem 14543MB +[2023-10-13 11:45:43 simmim_pretrain](main_simmim.py 218): INFO Train: [169/200][5500/6787] eta 0:05:24 lr 0.000200 time 0.2584 (0.2518) loss 
0.3656 (0.3678) grad_norm 22016.1504 (inf) mem 14543MB +[2023-10-13 11:47:49 simmim_pretrain](main_simmim.py 218): INFO Train: [169/200][6000/6787] eta 0:03:18 lr 0.000200 time 0.2529 (0.2518) loss 0.3860 (0.3674) grad_norm 51017.5742 (inf) mem 14543MB +[2023-10-13 11:49:55 simmim_pretrain](main_simmim.py 218): INFO Train: [169/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2467 (0.2519) loss 0.3502 (0.3669) grad_norm 58240.2656 (inf) mem 14543MB +[2023-10-13 11:51:08 simmim_pretrain](main_simmim.py 228): INFO EPOCH 169 training takes 0:28:30 +[2023-10-13 11:51:09 simmim_pretrain](main_simmim.py 218): INFO Train: [170/200][0/6787] eta 2:43:37 lr 0.000200 time 1.4465 (1.4465) loss 0.3507 (0.3507) grad_norm 58643.3359 (58643.3359) mem 14543MB +[2023-10-13 11:53:15 simmim_pretrain](main_simmim.py 218): INFO Train: [170/200][500/6787] eta 0:26:32 lr 0.000200 time 0.2513 (0.2534) loss 0.3657 (0.3615) grad_norm 40290.1523 (48998.8633) mem 14543MB +[2023-10-13 11:55:20 simmim_pretrain](main_simmim.py 218): INFO Train: [170/200][1000/6787] eta 0:24:21 lr 0.000200 time 0.2517 (0.2525) loss 0.3525 (0.3611) grad_norm 37085.3789 (53129.5859) mem 14543MB +[2023-10-13 11:57:26 simmim_pretrain](main_simmim.py 218): INFO Train: [170/200][1500/6787] eta 0:22:12 lr 0.000200 time 0.2540 (0.2521) loss 0.3643 (0.3606) grad_norm 91229.5312 (59043.3438) mem 14543MB +[2023-10-13 11:59:32 simmim_pretrain](main_simmim.py 218): INFO Train: [170/200][2000/6787] eta 0:20:05 lr 0.000200 time 0.2469 (0.2518) loss 0.3761 (0.3599) grad_norm 76814.6484 (64850.3672) mem 14543MB +[2023-10-13 12:01:37 simmim_pretrain](main_simmim.py 218): INFO Train: [170/200][2500/6787] eta 0:17:58 lr 0.000200 time 0.2479 (0.2517) loss 0.3625 (0.3595) grad_norm 99347.4531 (69596.9688) mem 14543MB +[2023-10-13 12:03:43 simmim_pretrain](main_simmim.py 218): INFO Train: [170/200][3000/6787] eta 0:15:52 lr 0.000200 time 0.2465 (0.2516) loss 0.3423 (0.3594) grad_norm 95042.6016 (75063.4531) mem 14543MB +[2023-10-13 
12:05:48 simmim_pretrain](main_simmim.py 218): INFO Train: [170/200][3500/6787] eta 0:13:46 lr 0.000200 time 0.2584 (0.2515) loss 0.3459 (0.3592) grad_norm 92178.9688 (82647.7812) mem 14543MB +[2023-10-13 12:07:54 simmim_pretrain](main_simmim.py 218): INFO Train: [170/200][4000/6787] eta 0:11:40 lr 0.000200 time 0.2481 (0.2515) loss 0.3740 (0.3589) grad_norm 154235.5000 (90068.5781) mem 14543MB +[2023-10-13 12:09:59 simmim_pretrain](main_simmim.py 218): INFO Train: [170/200][4500/6787] eta 0:09:34 lr 0.000200 time 0.2464 (0.2514) loss 0.3386 (0.3587) grad_norm 237254.6875 (98517.3438) mem 14543MB +[2023-10-13 12:12:05 simmim_pretrain](main_simmim.py 218): INFO Train: [170/200][5000/6787] eta 0:07:29 lr 0.000200 time 0.2562 (0.2514) loss 0.3495 (0.3586) grad_norm 195652.6719 (109687.5781) mem 14543MB +[2023-10-13 12:14:10 simmim_pretrain](main_simmim.py 218): INFO Train: [170/200][5500/6787] eta 0:05:23 lr 0.000200 time 0.2530 (0.2513) loss 0.3794 (0.3585) grad_norm 410068.5625 (125423.1797) mem 14543MB +[2023-10-13 12:16:16 simmim_pretrain](main_simmim.py 218): INFO Train: [170/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2496 (0.2513) loss 0.3532 (0.3583) grad_norm 216391.1562 (139856.7969) mem 14543MB +[2023-10-13 12:18:21 simmim_pretrain](main_simmim.py 218): INFO Train: [170/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2527 (0.2513) loss 0.3402 (0.3582) grad_norm 397446.0312 (153151.2812) mem 14543MB +[2023-10-13 12:19:34 simmim_pretrain](main_simmim.py 228): INFO EPOCH 170 training takes 0:28:26 +[2023-10-13 12:19:36 simmim_pretrain](main_simmim.py 218): INFO Train: [171/200][0/6787] eta 3:01:10 lr 0.000200 time 1.6017 (1.6017) loss 0.3693 (0.3693) grad_norm 466116.8125 (466116.8125) mem 14543MB +[2023-10-13 12:21:41 simmim_pretrain](main_simmim.py 218): INFO Train: [171/200][500/6787] eta 0:26:35 lr 0.000200 time 0.2495 (0.2538) loss 0.3490 (0.3578) grad_norm 424978.3438 (338295.4375) mem 14543MB +[2023-10-13 12:23:47 simmim_pretrain](main_simmim.py 218): 
INFO Train: [171/200][1000/6787] eta 0:24:23 lr 0.000200 time 0.2521 (0.2529) loss 0.3701 (0.3570) grad_norm 395261.7500 (345965.4688) mem 14543MB +[2023-10-13 12:25:53 simmim_pretrain](main_simmim.py 218): INFO Train: [171/200][1500/6787] eta 0:22:16 lr 0.000200 time 0.2589 (0.2528) loss 0.3567 (0.3568) grad_norm 325881.0625 (348855.5312) mem 14543MB +[2023-10-13 12:28:00 simmim_pretrain](main_simmim.py 218): INFO Train: [171/200][2000/6787] eta 0:20:09 lr 0.000200 time 0.2546 (0.2527) loss 0.3614 (0.3564) grad_norm 841634.9375 (355705.6250) mem 14543MB +[2023-10-13 12:30:06 simmim_pretrain](main_simmim.py 218): INFO Train: [171/200][2500/6787] eta 0:18:03 lr 0.000200 time 0.2454 (0.2527) loss 0.3485 (0.3564) grad_norm 521801.4688 (inf) mem 14543MB +[2023-10-13 12:32:12 simmim_pretrain](main_simmim.py 218): INFO Train: [171/200][3000/6787] eta 0:15:57 lr 0.000200 time 0.2525 (0.2527) loss 0.3565 (0.3563) grad_norm 465554.5312 (inf) mem 14543MB +[2023-10-13 12:34:19 simmim_pretrain](main_simmim.py 218): INFO Train: [171/200][3500/6787] eta 0:13:50 lr 0.000200 time 0.2515 (0.2527) loss 0.3585 (0.3562) grad_norm 464424.2500 (inf) mem 14543MB +[2023-10-13 12:36:25 simmim_pretrain](main_simmim.py 218): INFO Train: [171/200][4000/6787] eta 0:11:44 lr 0.000200 time 0.2529 (0.2527) loss 0.3468 (0.3561) grad_norm 475429.9375 (inf) mem 14543MB +[2023-10-13 12:38:31 simmim_pretrain](main_simmim.py 218): INFO Train: [171/200][4500/6787] eta 0:09:37 lr 0.000200 time 0.2504 (0.2526) loss 0.3617 (0.3559) grad_norm 464281.5000 (inf) mem 14543MB +[2023-10-13 12:40:37 simmim_pretrain](main_simmim.py 218): INFO Train: [171/200][5000/6787] eta 0:07:31 lr 0.000200 time 0.2538 (0.2525) loss 0.3614 (0.3559) grad_norm 432284.7188 (inf) mem 14543MB +[2023-10-13 12:42:43 simmim_pretrain](main_simmim.py 218): INFO Train: [171/200][5500/6787] eta 0:05:24 lr 0.000200 time 0.2522 (0.2524) loss 0.3652 (0.3560) grad_norm 281701.3750 (inf) mem 14543MB +[2023-10-13 12:44:48 
simmim_pretrain](main_simmim.py 218): INFO Train: [171/200][6000/6787] eta 0:03:18 lr 0.000200 time 0.2528 (0.2524) loss 0.3715 (0.3560) grad_norm 299196.1875 (inf) mem 14543MB +[2023-10-13 12:46:54 simmim_pretrain](main_simmim.py 218): INFO Train: [171/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2502 (0.2523) loss 0.3581 (0.3561) grad_norm 112424.3359 (inf) mem 14543MB +[2023-10-13 12:48:07 simmim_pretrain](main_simmim.py 228): INFO EPOCH 171 training takes 0:28:32 +[2023-10-13 12:48:08 simmim_pretrain](main_simmim.py 218): INFO Train: [172/200][0/6787] eta 2:52:41 lr 0.000200 time 1.5267 (1.5267) loss 0.3633 (0.3633) grad_norm 134335.3125 (134335.3125) mem 14543MB +[2023-10-13 12:50:14 simmim_pretrain](main_simmim.py 218): INFO Train: [172/200][500/6787] eta 0:26:33 lr 0.000200 time 0.2511 (0.2535) loss 0.3573 (0.3568) grad_norm 281342.1875 (214038.3125) mem 14543MB +[2023-10-13 12:52:19 simmim_pretrain](main_simmim.py 218): INFO Train: [172/200][1000/6787] eta 0:24:19 lr 0.000200 time 0.2500 (0.2522) loss 0.3687 (0.3572) grad_norm 235114.5469 (215037.8594) mem 14543MB +[2023-10-13 12:54:25 simmim_pretrain](main_simmim.py 218): INFO Train: [172/200][1500/6787] eta 0:22:11 lr 0.000200 time 0.2508 (0.2518) loss 0.3305 (0.3567) grad_norm 389761.3750 (235739.2344) mem 14543MB +[2023-10-13 12:56:31 simmim_pretrain](main_simmim.py 218): INFO Train: [172/200][2000/6787] eta 0:20:05 lr 0.000200 time 0.2482 (0.2517) loss 0.3575 (0.3567) grad_norm 364430.0938 (257126.1562) mem 14543MB +[2023-10-13 12:58:36 simmim_pretrain](main_simmim.py 218): INFO Train: [172/200][2500/6787] eta 0:17:58 lr 0.000200 time 0.2537 (0.2516) loss 0.3557 (0.3564) grad_norm 432418.4688 (273453.0000) mem 14543MB +[2023-10-13 13:00:42 simmim_pretrain](main_simmim.py 218): INFO Train: [172/200][3000/6787] eta 0:15:52 lr 0.000200 time 0.2486 (0.2516) loss 0.3446 (0.3564) grad_norm 220069.6250 (inf) mem 14543MB +[2023-10-13 13:02:47 simmim_pretrain](main_simmim.py 218): INFO Train: 
[172/200][3500/6787] eta 0:13:46 lr 0.000200 time 0.2512 (0.2515) loss 0.3554 (0.3566) grad_norm 158950.5938 (inf) mem 14543MB +[2023-10-13 13:04:53 simmim_pretrain](main_simmim.py 218): INFO Train: [172/200][4000/6787] eta 0:11:40 lr 0.000200 time 0.2517 (0.2514) loss 0.3448 (0.3567) grad_norm 299793.8750 (inf) mem 14543MB +[2023-10-13 13:06:58 simmim_pretrain](main_simmim.py 218): INFO Train: [172/200][4500/6787] eta 0:09:34 lr 0.000200 time 0.2491 (0.2513) loss 0.3577 (0.3567) grad_norm 121309.2734 (inf) mem 14543MB +[2023-10-13 13:09:04 simmim_pretrain](main_simmim.py 218): INFO Train: [172/200][5000/6787] eta 0:07:29 lr 0.000200 time 0.2521 (0.2513) loss 0.3635 (0.3568) grad_norm 146870.6094 (inf) mem 14543MB +[2023-10-13 13:11:09 simmim_pretrain](main_simmim.py 218): INFO Train: [172/200][5500/6787] eta 0:05:23 lr 0.000200 time 0.2502 (0.2513) loss 0.3662 (0.3567) grad_norm 697606.4375 (inf) mem 14543MB +[2023-10-13 13:13:15 simmim_pretrain](main_simmim.py 218): INFO Train: [172/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2504 (0.2514) loss 0.3408 (0.3567) grad_norm 417194.5312 (inf) mem 14543MB +[2023-10-13 13:15:21 simmim_pretrain](main_simmim.py 218): INFO Train: [172/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2482 (0.2514) loss 0.3290 (0.3566) grad_norm 464593.3750 (inf) mem 14543MB +[2023-10-13 13:16:34 simmim_pretrain](main_simmim.py 228): INFO EPOCH 172 training takes 0:28:27 +[2023-10-13 13:16:36 simmim_pretrain](main_simmim.py 218): INFO Train: [173/200][0/6787] eta 2:48:38 lr 0.000200 time 1.4908 (1.4908) loss 0.3410 (0.3410) grad_norm 364327.1562 (364327.1562) mem 14543MB +[2023-10-13 13:18:42 simmim_pretrain](main_simmim.py 218): INFO Train: [173/200][500/6787] eta 0:26:41 lr 0.000200 time 0.2516 (0.2548) loss 0.3250 (0.3559) grad_norm 374274.0000 (395520.8750) mem 14543MB +[2023-10-13 13:20:48 simmim_pretrain](main_simmim.py 218): INFO Train: [173/200][1000/6787] eta 0:24:27 lr 0.000200 time 0.2467 (0.2535) loss 0.3637 (0.3552) grad_norm 
561121.0000 (405394.1250) mem 14543MB +[2023-10-13 13:22:54 simmim_pretrain](main_simmim.py 218): INFO Train: [173/200][1500/6787] eta 0:22:18 lr 0.000200 time 0.2533 (0.2532) loss 0.3391 (0.3555) grad_norm 238660.5156 (inf) mem 14543MB +[2023-10-13 13:25:01 simmim_pretrain](main_simmim.py 218): INFO Train: [173/200][2000/6787] eta 0:20:11 lr 0.000200 time 0.2556 (0.2530) loss 0.3636 (0.3561) grad_norm 214211.0000 (inf) mem 14543MB +[2023-10-13 13:27:07 simmim_pretrain](main_simmim.py 218): INFO Train: [173/200][2500/6787] eta 0:18:03 lr 0.000200 time 0.2502 (0.2528) loss 0.3906 (0.3565) grad_norm 212856.2812 (inf) mem 14543MB +[2023-10-13 13:29:13 simmim_pretrain](main_simmim.py 218): INFO Train: [173/200][3000/6787] eta 0:15:56 lr 0.000200 time 0.2525 (0.2526) loss 0.3737 (0.3568) grad_norm 177084.5469 (inf) mem 14543MB +[2023-10-13 13:31:18 simmim_pretrain](main_simmim.py 218): INFO Train: [173/200][3500/6787] eta 0:13:49 lr 0.000200 time 0.2496 (0.2525) loss 0.3302 (0.3570) grad_norm 161417.3750 (inf) mem 14543MB +[2023-10-13 13:33:24 simmim_pretrain](main_simmim.py 218): INFO Train: [173/200][4000/6787] eta 0:11:43 lr 0.000200 time 0.2530 (0.2523) loss 0.3739 (0.3569) grad_norm 320008.7812 (inf) mem 14543MB +[2023-10-13 13:35:29 simmim_pretrain](main_simmim.py 218): INFO Train: [173/200][4500/6787] eta 0:09:36 lr 0.000200 time 0.2526 (0.2521) loss 0.3397 (0.3569) grad_norm 146682.1406 (inf) mem 14543MB +[2023-10-13 13:37:35 simmim_pretrain](main_simmim.py 218): INFO Train: [173/200][5000/6787] eta 0:07:30 lr 0.000200 time 0.2498 (0.2520) loss 0.3433 (0.3570) grad_norm 162987.9375 (inf) mem 14543MB +[2023-10-13 13:39:40 simmim_pretrain](main_simmim.py 218): INFO Train: [173/200][5500/6787] eta 0:05:24 lr 0.000200 time 0.2542 (0.2518) loss 0.3443 (0.3571) grad_norm 94976.4062 (inf) mem 14543MB +[2023-10-13 13:41:45 simmim_pretrain](main_simmim.py 218): INFO Train: [173/200][6000/6787] eta 0:03:18 lr 0.000200 time 0.2488 (0.2517) loss 0.3680 (0.3575) grad_norm 
142997.5312 (inf) mem 14543MB +[2023-10-13 13:43:50 simmim_pretrain](main_simmim.py 218): INFO Train: [173/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2546 (0.2516) loss 0.3529 (0.3576) grad_norm 106500.7188 (inf) mem 14543MB +[2023-10-13 13:45:02 simmim_pretrain](main_simmim.py 228): INFO EPOCH 173 training takes 0:28:28 +[2023-10-13 13:45:04 simmim_pretrain](main_simmim.py 218): INFO Train: [174/200][0/6787] eta 2:57:04 lr 0.000200 time 1.5654 (1.5654) loss 0.3643 (0.3643) grad_norm 161605.8125 (161605.8125) mem 14543MB +[2023-10-13 13:47:09 simmim_pretrain](main_simmim.py 218): INFO Train: [174/200][500/6787] eta 0:26:28 lr 0.000200 time 0.2467 (0.2526) loss 0.3632 (0.3582) grad_norm 119358.9453 (142749.9062) mem 14543MB +[2023-10-13 13:49:15 simmim_pretrain](main_simmim.py 218): INFO Train: [174/200][1000/6787] eta 0:24:18 lr 0.000200 time 0.2469 (0.2521) loss 0.3434 (0.3576) grad_norm 205515.5938 (157318.2500) mem 14543MB +[2023-10-13 13:51:21 simmim_pretrain](main_simmim.py 218): INFO Train: [174/200][1500/6787] eta 0:22:11 lr 0.000200 time 0.2489 (0.2519) loss 0.3750 (0.3576) grad_norm 245292.9219 (164527.9219) mem 14543MB +[2023-10-13 13:53:27 simmim_pretrain](main_simmim.py 218): INFO Train: [174/200][2000/6787] eta 0:20:06 lr 0.000200 time 0.2465 (0.2520) loss 0.3635 (0.3576) grad_norm 197961.8438 (inf) mem 14543MB +[2023-10-13 13:55:33 simmim_pretrain](main_simmim.py 218): INFO Train: [174/200][2500/6787] eta 0:18:00 lr 0.000200 time 0.2535 (0.2520) loss 0.3595 (0.3579) grad_norm 101259.0312 (inf) mem 14543MB +[2023-10-13 13:57:39 simmim_pretrain](main_simmim.py 218): INFO Train: [174/200][3000/6787] eta 0:15:54 lr 0.000200 time 0.2562 (0.2521) loss 0.3660 (0.3583) grad_norm 155792.5781 (inf) mem 14543MB +[2023-10-13 13:59:45 simmim_pretrain](main_simmim.py 218): INFO Train: [174/200][3500/6787] eta 0:13:48 lr 0.000200 time 0.2456 (0.2522) loss 0.3399 (0.3586) grad_norm 113733.9609 (inf) mem 14543MB +[2023-10-13 14:01:52 
simmim_pretrain](main_simmim.py 218): INFO Train: [174/200][4000/6787] eta 0:11:43 lr 0.000200 time 0.2578 (0.2523) loss 0.3350 (0.3588) grad_norm 154773.9375 (inf) mem 14543MB +[2023-10-13 14:03:58 simmim_pretrain](main_simmim.py 218): INFO Train: [174/200][4500/6787] eta 0:09:37 lr 0.000200 time 0.2562 (0.2523) loss 0.3624 (0.3587) grad_norm 98354.1719 (inf) mem 14543MB +[2023-10-13 14:06:04 simmim_pretrain](main_simmim.py 218): INFO Train: [174/200][5000/6787] eta 0:07:30 lr 0.000200 time 0.2562 (0.2523) loss 0.3550 (0.3585) grad_norm 200539.7812 (inf) mem 14543MB +[2023-10-13 14:08:11 simmim_pretrain](main_simmim.py 218): INFO Train: [174/200][5500/6787] eta 0:05:24 lr 0.000200 time 0.2539 (0.2524) loss 0.3701 (0.3584) grad_norm 204500.5938 (inf) mem 14543MB +[2023-10-13 14:10:17 simmim_pretrain](main_simmim.py 218): INFO Train: [174/200][6000/6787] eta 0:03:18 lr 0.000200 time 0.2513 (0.2524) loss 0.3585 (0.3584) grad_norm 298652.7188 (inf) mem 14543MB +[2023-10-13 14:12:24 simmim_pretrain](main_simmim.py 218): INFO Train: [174/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2520 (0.2525) loss 0.3551 (0.3582) grad_norm 346487.5938 (inf) mem 14543MB +[2023-10-13 14:13:37 simmim_pretrain](main_simmim.py 228): INFO EPOCH 174 training takes 0:28:34 +[2023-10-13 14:13:38 simmim_pretrain](main_simmim.py 218): INFO Train: [175/200][0/6787] eta 2:40:12 lr 0.000200 time 1.4163 (1.4163) loss 0.3526 (0.3526) grad_norm 171432.5312 (171432.5312) mem 14543MB +[2023-10-13 14:15:44 simmim_pretrain](main_simmim.py 218): INFO Train: [175/200][500/6787] eta 0:26:35 lr 0.000200 time 0.2517 (0.2538) loss 0.3681 (0.3567) grad_norm 340015.5625 (322708.9688) mem 14543MB +[2023-10-13 14:17:50 simmim_pretrain](main_simmim.py 218): INFO Train: [175/200][1000/6787] eta 0:24:23 lr 0.000200 time 0.2521 (0.2528) loss 0.3775 (0.3565) grad_norm 337308.0000 (373191.3750) mem 14543MB +[2023-10-13 14:19:56 simmim_pretrain](main_simmim.py 218): INFO Train: [175/200][1500/6787] eta 0:22:15 lr 
0.000200 time 0.2481 (0.2525) loss 0.3501 (0.3564) grad_norm 315614.5000 (inf) mem 14543MB +[2023-10-13 14:22:01 simmim_pretrain](main_simmim.py 218): INFO Train: [175/200][2000/6787] eta 0:20:07 lr 0.000200 time 0.2457 (0.2522) loss 0.3634 (0.3563) grad_norm 495129.0938 (inf) mem 14543MB +[2023-10-13 14:24:07 simmim_pretrain](main_simmim.py 218): INFO Train: [175/200][2500/6787] eta 0:18:00 lr 0.000200 time 0.2499 (0.2520) loss 0.3562 (0.3561) grad_norm 300935.6875 (inf) mem 14543MB +[2023-10-13 14:26:12 simmim_pretrain](main_simmim.py 218): INFO Train: [175/200][3000/6787] eta 0:15:53 lr 0.000200 time 0.2490 (0.2518) loss 0.3837 (0.3562) grad_norm 295160.4688 (inf) mem 14543MB +[2023-10-13 14:28:18 simmim_pretrain](main_simmim.py 218): INFO Train: [175/200][3500/6787] eta 0:13:47 lr 0.000200 time 0.2464 (0.2517) loss 0.3642 (0.3560) grad_norm 323705.7500 (inf) mem 14543MB +[2023-10-13 14:30:23 simmim_pretrain](main_simmim.py 218): INFO Train: [175/200][4000/6787] eta 0:11:41 lr 0.000200 time 0.2519 (0.2515) loss 0.3519 (0.3559) grad_norm 426892.1875 (inf) mem 14543MB +[2023-10-13 14:32:28 simmim_pretrain](main_simmim.py 218): INFO Train: [175/200][4500/6787] eta 0:09:35 lr 0.000200 time 0.2547 (0.2515) loss 0.3643 (0.3559) grad_norm 493198.4688 (inf) mem 14543MB +[2023-10-13 14:34:33 simmim_pretrain](main_simmim.py 218): INFO Train: [175/200][5000/6787] eta 0:07:29 lr 0.000200 time 0.2453 (0.2513) loss 0.3492 (0.3560) grad_norm inf (inf) mem 14543MB +[2023-10-13 14:36:39 simmim_pretrain](main_simmim.py 218): INFO Train: [175/200][5500/6787] eta 0:05:23 lr 0.000200 time 0.2559 (0.2512) loss 0.3482 (0.3561) grad_norm 364682.7812 (inf) mem 14543MB +[2023-10-13 14:38:44 simmim_pretrain](main_simmim.py 218): INFO Train: [175/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2500 (0.2512) loss 0.3774 (0.3563) grad_norm 121211.3516 (inf) mem 14543MB +[2023-10-13 14:40:49 simmim_pretrain](main_simmim.py 218): INFO Train: [175/200][6500/6787] eta 0:01:12 lr 0.000200 time 
0.2459 (0.2511) loss 0.3731 (0.3564) grad_norm 352697.5938 (inf) mem 14543MB +[2023-10-13 14:42:02 simmim_pretrain](main_simmim.py 228): INFO EPOCH 175 training takes 0:28:24 +[2023-10-13 14:42:03 simmim_pretrain](main_simmim.py 218): INFO Train: [176/200][0/6787] eta 2:32:37 lr 0.000200 time 1.3492 (1.3492) loss 0.3504 (0.3504) grad_norm 304569.6875 (304569.6875) mem 14543MB +[2023-10-13 14:44:08 simmim_pretrain](main_simmim.py 218): INFO Train: [176/200][500/6787] eta 0:26:32 lr 0.000200 time 0.2528 (0.2533) loss 0.3783 (0.3570) grad_norm 360711.3438 (262258.7812) mem 14543MB +[2023-10-13 14:46:14 simmim_pretrain](main_simmim.py 218): INFO Train: [176/200][1000/6787] eta 0:24:21 lr 0.000200 time 0.2512 (0.2525) loss 0.3344 (0.3567) grad_norm 382950.3438 (284186.5312) mem 14543MB +[2023-10-13 14:48:20 simmim_pretrain](main_simmim.py 218): INFO Train: [176/200][1500/6787] eta 0:22:14 lr 0.000200 time 0.2498 (0.2524) loss 0.3922 (0.3564) grad_norm 420884.3125 (313460.2500) mem 14543MB +[2023-10-13 14:50:27 simmim_pretrain](main_simmim.py 218): INFO Train: [176/200][2000/6787] eta 0:20:08 lr 0.000200 time 0.2500 (0.2524) loss 0.3466 (0.3562) grad_norm 569204.1250 (332326.0625) mem 14543MB +[2023-10-13 14:52:33 simmim_pretrain](main_simmim.py 218): INFO Train: [176/200][2500/6787] eta 0:18:02 lr 0.000200 time 0.2526 (0.2525) loss 0.3668 (0.3560) grad_norm 227858.9219 (inf) mem 14543MB +[2023-10-13 14:54:39 simmim_pretrain](main_simmim.py 218): INFO Train: [176/200][3000/6787] eta 0:15:56 lr 0.000200 time 0.2592 (0.2526) loss 0.3587 (0.3562) grad_norm 192857.4531 (inf) mem 14543MB +[2023-10-13 14:56:46 simmim_pretrain](main_simmim.py 218): INFO Train: [176/200][3500/6787] eta 0:13:50 lr 0.000200 time 0.2503 (0.2526) loss 0.3633 (0.3564) grad_norm 145429.2656 (inf) mem 14543MB +[2023-10-13 14:58:52 simmim_pretrain](main_simmim.py 218): INFO Train: [176/200][4000/6787] eta 0:11:43 lr 0.000200 time 0.2474 (0.2525) loss 0.3313 (0.3570) grad_norm 165360.3281 (inf) mem 
14543MB +[2023-10-13 15:00:58 simmim_pretrain](main_simmim.py 218): INFO Train: [176/200][4500/6787] eta 0:09:37 lr 0.000200 time 0.2533 (0.2525) loss 0.3604 (0.3574) grad_norm 115199.1484 (inf) mem 14543MB +[2023-10-13 15:03:04 simmim_pretrain](main_simmim.py 218): INFO Train: [176/200][5000/6787] eta 0:07:30 lr 0.000200 time 0.2517 (0.2524) loss 0.3681 (0.3576) grad_norm 133063.5156 (inf) mem 14543MB +[2023-10-13 15:05:09 simmim_pretrain](main_simmim.py 218): INFO Train: [176/200][5500/6787] eta 0:05:24 lr 0.000200 time 0.2527 (0.2522) loss 0.3429 (0.3579) grad_norm 154319.3750 (inf) mem 14543MB +[2023-10-13 15:07:14 simmim_pretrain](main_simmim.py 218): INFO Train: [176/200][6000/6787] eta 0:03:18 lr 0.000200 time 0.2466 (0.2521) loss 0.3500 (0.3578) grad_norm 129154.7734 (inf) mem 14543MB +[2023-10-13 15:09:20 simmim_pretrain](main_simmim.py 218): INFO Train: [176/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2475 (0.2520) loss 0.3682 (0.3578) grad_norm 121991.1094 (inf) mem 14543MB +[2023-10-13 15:10:32 simmim_pretrain](main_simmim.py 228): INFO EPOCH 176 training takes 0:28:30 +[2023-10-13 15:10:34 simmim_pretrain](main_simmim.py 218): INFO Train: [177/200][0/6787] eta 3:05:32 lr 0.000200 time 1.6402 (1.6402) loss 0.3702 (0.3702) grad_norm 82397.2344 (82397.2344) mem 14543MB +[2023-10-13 15:12:39 simmim_pretrain](main_simmim.py 218): INFO Train: [177/200][500/6787] eta 0:26:30 lr 0.000200 time 0.2522 (0.2530) loss 0.3670 (0.3589) grad_norm 90639.8359 (131379.0312) mem 14543MB +[2023-10-13 15:14:44 simmim_pretrain](main_simmim.py 218): INFO Train: [177/200][1000/6787] eta 0:24:15 lr 0.000200 time 0.2533 (0.2515) loss 0.3399 (0.3588) grad_norm 138804.9375 (128666.5234) mem 14543MB +[2023-10-13 15:16:49 simmim_pretrain](main_simmim.py 218): INFO Train: [177/200][1500/6787] eta 0:22:07 lr 0.000200 time 0.2507 (0.2511) loss 0.3772 (0.3593) grad_norm 126654.2969 (125652.7578) mem 14543MB +[2023-10-13 15:18:54 simmim_pretrain](main_simmim.py 218): INFO Train: 
[177/200][2000/6787] eta 0:20:01 lr 0.000200 time 0.2518 (0.2509) loss 0.3604 (0.3591) grad_norm 160520.8906 (125847.0234) mem 14543MB +[2023-10-13 15:21:00 simmim_pretrain](main_simmim.py 218): INFO Train: [177/200][2500/6787] eta 0:17:55 lr 0.000200 time 0.2491 (0.2509) loss 0.3669 (0.3587) grad_norm 126495.6406 (132592.0469) mem 14543MB +[2023-10-13 15:23:05 simmim_pretrain](main_simmim.py 218): INFO Train: [177/200][3000/6787] eta 0:15:50 lr 0.000200 time 0.2454 (0.2509) loss 0.3558 (0.3586) grad_norm 228613.7656 (140080.8125) mem 14543MB +[2023-10-13 15:25:11 simmim_pretrain](main_simmim.py 218): INFO Train: [177/200][3500/6787] eta 0:13:45 lr 0.000200 time 0.2538 (0.2510) loss 0.3351 (0.3584) grad_norm 133409.6562 (148887.7500) mem 14543MB +[2023-10-13 15:27:17 simmim_pretrain](main_simmim.py 218): INFO Train: [177/200][4000/6787] eta 0:11:39 lr 0.000200 time 0.2525 (0.2511) loss 0.3366 (0.3582) grad_norm 288540.2188 (157983.7500) mem 14543MB +[2023-10-13 15:29:23 simmim_pretrain](main_simmim.py 218): INFO Train: [177/200][4500/6787] eta 0:09:34 lr 0.000200 time 0.2487 (0.2512) loss 0.3534 (0.3580) grad_norm 291285.2812 (171557.6875) mem 14543MB +[2023-10-13 15:31:29 simmim_pretrain](main_simmim.py 218): INFO Train: [177/200][5000/6787] eta 0:07:29 lr 0.000200 time 0.2533 (0.2513) loss 0.3296 (0.3577) grad_norm 228369.9531 (187558.5938) mem 14543MB +[2023-10-13 15:33:35 simmim_pretrain](main_simmim.py 218): INFO Train: [177/200][5500/6787] eta 0:05:23 lr 0.000200 time 0.2543 (0.2514) loss 0.3602 (0.3576) grad_norm 651642.6875 (202574.6719) mem 14543MB +[2023-10-13 15:35:42 simmim_pretrain](main_simmim.py 218): INFO Train: [177/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2459 (0.2515) loss 0.3505 (0.3576) grad_norm 212421.2031 (nan) mem 14543MB +[2023-10-13 15:37:48 simmim_pretrain](main_simmim.py 218): INFO Train: [177/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2575 (0.2517) loss 0.3672 (0.3576) grad_norm 225284.5156 (nan) mem 14543MB +[2023-10-13 
15:39:01 simmim_pretrain](main_simmim.py 228): INFO EPOCH 177 training takes 0:28:29 +[2023-10-13 15:39:03 simmim_pretrain](main_simmim.py 218): INFO Train: [178/200][0/6787] eta 2:43:36 lr 0.000200 time 1.4463 (1.4463) loss 0.3479 (0.3479) grad_norm 279496.1875 (279496.1875) mem 14543MB +[2023-10-13 15:41:09 simmim_pretrain](main_simmim.py 218): INFO Train: [178/200][500/6787] eta 0:26:39 lr 0.000200 time 0.2540 (0.2545) loss 0.3604 (0.3572) grad_norm 155986.1250 (inf) mem 14543MB +[2023-10-13 15:43:15 simmim_pretrain](main_simmim.py 218): INFO Train: [178/200][1000/6787] eta 0:24:27 lr 0.000200 time 0.2511 (0.2536) loss 0.3650 (0.3588) grad_norm 152570.0625 (inf) mem 14543MB +[2023-10-13 15:45:21 simmim_pretrain](main_simmim.py 218): INFO Train: [178/200][1500/6787] eta 0:22:18 lr 0.000200 time 0.2516 (0.2532) loss 0.3484 (0.3591) grad_norm 122526.7188 (inf) mem 14543MB +[2023-10-13 15:47:28 simmim_pretrain](main_simmim.py 218): INFO Train: [178/200][2000/6787] eta 0:20:10 lr 0.000200 time 0.2534 (0.2530) loss 0.3494 (0.3595) grad_norm 193233.1562 (inf) mem 14543MB +[2023-10-13 15:49:34 simmim_pretrain](main_simmim.py 218): INFO Train: [178/200][2500/6787] eta 0:18:03 lr 0.000200 time 0.2505 (0.2528) loss 0.3541 (0.3594) grad_norm 93997.1406 (inf) mem 14543MB +[2023-10-13 15:51:39 simmim_pretrain](main_simmim.py 218): INFO Train: [178/200][3000/6787] eta 0:15:56 lr 0.000200 time 0.2591 (0.2525) loss 0.3529 (0.3591) grad_norm 165179.5156 (inf) mem 14543MB +[2023-10-13 15:53:45 simmim_pretrain](main_simmim.py 218): INFO Train: [178/200][3500/6787] eta 0:13:49 lr 0.000200 time 0.2499 (0.2523) loss 0.3481 (0.3588) grad_norm 151640.6250 (inf) mem 14543MB +[2023-10-13 15:55:50 simmim_pretrain](main_simmim.py 218): INFO Train: [178/200][4000/6787] eta 0:11:42 lr 0.000200 time 0.2534 (0.2522) loss 0.3503 (0.3587) grad_norm 175264.9531 (inf) mem 14543MB +[2023-10-13 15:57:55 simmim_pretrain](main_simmim.py 218): INFO Train: [178/200][4500/6787] eta 0:09:36 lr 0.000200 
time 0.2506 (0.2520) loss 0.3480 (0.3584) grad_norm 144900.8438 (inf) mem 14543MB +[2023-10-13 16:00:01 simmim_pretrain](main_simmim.py 218): INFO Train: [178/200][5000/6787] eta 0:07:29 lr 0.000200 time 0.2589 (0.2518) loss 0.3573 (0.3582) grad_norm 357703.0938 (inf) mem 14543MB +[2023-10-13 16:02:06 simmim_pretrain](main_simmim.py 218): INFO Train: [178/200][5500/6787] eta 0:05:23 lr 0.000200 time 0.2502 (0.2516) loss 0.3635 (0.3581) grad_norm 372290.4375 (inf) mem 14543MB +[2023-10-13 16:04:11 simmim_pretrain](main_simmim.py 218): INFO Train: [178/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2516 (0.2515) loss 0.3635 (0.3580) grad_norm 212023.1250 (inf) mem 14543MB +[2023-10-13 16:06:16 simmim_pretrain](main_simmim.py 218): INFO Train: [178/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2512 (0.2514) loss 0.3423 (0.3578) grad_norm 201310.3125 (inf) mem 14543MB +[2023-10-13 16:07:28 simmim_pretrain](main_simmim.py 228): INFO EPOCH 178 training takes 0:28:26 +[2023-10-13 16:07:30 simmim_pretrain](main_simmim.py 218): INFO Train: [179/200][0/6787] eta 2:58:19 lr 0.000200 time 1.5764 (1.5764) loss 0.3674 (0.3674) grad_norm 436660.1250 (436660.1250) mem 14543MB +[2023-10-13 16:09:35 simmim_pretrain](main_simmim.py 218): INFO Train: [179/200][500/6787] eta 0:26:30 lr 0.000200 time 0.2597 (0.2529) loss 0.3590 (0.3567) grad_norm 201875.3750 (inf) mem 14543MB +[2023-10-13 16:11:41 simmim_pretrain](main_simmim.py 218): INFO Train: [179/200][1000/6787] eta 0:24:20 lr 0.000200 time 0.2556 (0.2524) loss 0.3547 (0.3589) grad_norm 98453.1562 (inf) mem 14543MB +[2023-10-13 16:13:47 simmim_pretrain](main_simmim.py 218): INFO Train: [179/200][1500/6787] eta 0:22:15 lr 0.000200 time 0.2508 (0.2526) loss 0.3435 (0.3592) grad_norm 137136.0781 (inf) mem 14543MB +[2023-10-13 16:15:54 simmim_pretrain](main_simmim.py 218): INFO Train: [179/200][2000/6787] eta 0:20:10 lr 0.000200 time 0.2553 (0.2528) loss 0.3464 (0.3596) grad_norm 145842.3281 (inf) mem 14543MB +[2023-10-13 16:18:01 
simmim_pretrain](main_simmim.py 218): INFO Train: [179/200][2500/6787] eta 0:18:04 lr 0.000200 time 0.2555 (0.2531) loss 0.3649 (0.3596) grad_norm 113860.8125 (inf) mem 14543MB +[2023-10-13 16:20:09 simmim_pretrain](main_simmim.py 218): INFO Train: [179/200][3000/6787] eta 0:16:00 lr 0.000200 time 0.2594 (0.2537) loss 0.3439 (0.3594) grad_norm 167252.1250 (inf) mem 14543MB +[2023-10-13 16:22:19 simmim_pretrain](main_simmim.py 218): INFO Train: [179/200][3500/6787] eta 0:13:56 lr 0.000200 time 0.2593 (0.2544) loss 0.3714 (0.3590) grad_norm 203946.1875 (inf) mem 14543MB +[2023-10-13 16:24:29 simmim_pretrain](main_simmim.py 218): INFO Train: [179/200][4000/6787] eta 0:11:50 lr 0.000200 time 0.2592 (0.2550) loss 0.3609 (0.3589) grad_norm 105017.1562 (inf) mem 14543MB +[2023-10-13 16:26:37 simmim_pretrain](main_simmim.py 218): INFO Train: [179/200][4500/6787] eta 0:09:43 lr 0.000200 time 0.2579 (0.2552) loss 0.3569 (0.3589) grad_norm 96443.5859 (inf) mem 14543MB +[2023-10-13 16:28:46 simmim_pretrain](main_simmim.py 218): INFO Train: [179/200][5000/6787] eta 0:07:36 lr 0.000200 time 0.2592 (0.2554) loss 0.3526 (0.3590) grad_norm 116598.6172 (inf) mem 14543MB +[2023-10-13 16:30:53 simmim_pretrain](main_simmim.py 218): INFO Train: [179/200][5500/6787] eta 0:05:28 lr 0.000200 time 0.2513 (0.2554) loss 0.3581 (0.3591) grad_norm 159960.1094 (inf) mem 14543MB +[2023-10-13 16:33:01 simmim_pretrain](main_simmim.py 218): INFO Train: [179/200][6000/6787] eta 0:03:20 lr 0.000200 time 0.2563 (0.2554) loss 0.3503 (0.3591) grad_norm 243081.6250 (inf) mem 14543MB +[2023-10-13 16:35:08 simmim_pretrain](main_simmim.py 218): INFO Train: [179/200][6500/6787] eta 0:01:13 lr 0.000200 time 0.2539 (0.2554) loss 0.3621 (0.3589) grad_norm 121159.7344 (inf) mem 14543MB +[2023-10-13 16:36:22 simmim_pretrain](main_simmim.py 228): INFO EPOCH 179 training takes 0:28:53 +[2023-10-13 16:36:24 simmim_pretrain](main_simmim.py 218): INFO Train: [180/200][0/6787] eta 2:44:49 lr 0.000200 time 1.4571 
(1.4571) loss 0.3565 (0.3565) grad_norm 215607.2500 (215607.2500) mem 14543MB +[2023-10-13 16:38:30 simmim_pretrain](main_simmim.py 218): INFO Train: [180/200][500/6787] eta 0:26:39 lr 0.000200 time 0.2466 (0.2544) loss 0.3521 (0.3571) grad_norm 224558.2656 (191749.0156) mem 14543MB +[2023-10-13 16:40:35 simmim_pretrain](main_simmim.py 218): INFO Train: [180/200][1000/6787] eta 0:24:23 lr 0.000200 time 0.2535 (0.2530) loss 0.3722 (0.3574) grad_norm 174706.5625 (195154.1250) mem 14543MB +[2023-10-13 16:42:41 simmim_pretrain](main_simmim.py 218): INFO Train: [180/200][1500/6787] eta 0:22:15 lr 0.000200 time 0.2489 (0.2525) loss 0.3626 (0.3570) grad_norm 398399.4688 (225161.1875) mem 14543MB +[2023-10-13 16:44:47 simmim_pretrain](main_simmim.py 218): INFO Train: [180/200][2000/6787] eta 0:20:07 lr 0.000200 time 0.2456 (0.2523) loss 0.3714 (0.3567) grad_norm 243426.7344 (inf) mem 14543MB +[2023-10-13 16:46:53 simmim_pretrain](main_simmim.py 218): INFO Train: [180/200][2500/6787] eta 0:18:00 lr 0.000200 time 0.2519 (0.2522) loss 0.3839 (0.3569) grad_norm 110051.3906 (nan) mem 14543MB +[2023-10-13 16:48:58 simmim_pretrain](main_simmim.py 218): INFO Train: [180/200][3000/6787] eta 0:15:54 lr 0.000200 time 0.2516 (0.2520) loss 0.3560 (0.3574) grad_norm 127171.4297 (nan) mem 14543MB +[2023-10-13 16:51:04 simmim_pretrain](main_simmim.py 218): INFO Train: [180/200][3500/6787] eta 0:13:48 lr 0.000200 time 0.2512 (0.2520) loss 0.3501 (0.3576) grad_norm 132613.2344 (nan) mem 14543MB +[2023-10-13 16:53:10 simmim_pretrain](main_simmim.py 218): INFO Train: [180/200][4000/6787] eta 0:11:41 lr 0.000200 time 0.2511 (0.2519) loss 0.3740 (0.3582) grad_norm 72408.9844 (nan) mem 14543MB +[2023-10-13 16:55:15 simmim_pretrain](main_simmim.py 218): INFO Train: [180/200][4500/6787] eta 0:09:35 lr 0.000200 time 0.2537 (0.2518) loss 0.3361 (0.3585) grad_norm 71960.0703 (nan) mem 14543MB +[2023-10-13 16:57:21 simmim_pretrain](main_simmim.py 218): INFO Train: [180/200][5000/6787] eta 0:07:29 lr 
0.000200 time 0.2489 (0.2517) loss 0.3537 (0.3589) grad_norm 60139.3828 (nan) mem 14543MB +[2023-10-13 16:59:27 simmim_pretrain](main_simmim.py 218): INFO Train: [180/200][5500/6787] eta 0:05:23 lr 0.000200 time 0.2598 (0.2517) loss 0.3631 (0.3592) grad_norm 79601.4609 (nan) mem 14543MB +[2023-10-13 17:01:33 simmim_pretrain](main_simmim.py 218): INFO Train: [180/200][6000/6787] eta 0:03:18 lr 0.000200 time 0.2582 (0.2518) loss 0.3625 (0.3593) grad_norm 68794.8203 (nan) mem 14543MB +[2023-10-13 17:03:39 simmim_pretrain](main_simmim.py 218): INFO Train: [180/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2493 (0.2518) loss 0.3575 (0.3594) grad_norm 85724.7969 (nan) mem 14543MB +[2023-10-13 17:04:52 simmim_pretrain](main_simmim.py 228): INFO EPOCH 180 training takes 0:28:29 +[2023-10-13 17:04:52 simmim_pretrain](utils.py 62): INFO /root/autodl-tmp/LSQ-simmim/checkpoint/simmim_pretrain/vit_simmim/ckpt_epoch_180.pth saving...... +[2023-10-13 17:04:53 simmim_pretrain](utils.py 64): INFO /root/autodl-tmp/LSQ-simmim/checkpoint/simmim_pretrain/vit_simmim/ckpt_epoch_180.pth saved !!! 
+[2023-10-13 17:04:54 simmim_pretrain](main_simmim.py 218): INFO Train: [181/200][0/6787] eta 2:26:40 lr 0.000200 time 1.2967 (1.2967) loss 0.3552 (0.3552) grad_norm 85865.9219 (85865.9219) mem 14543MB +[2023-10-13 17:07:00 simmim_pretrain](main_simmim.py 218): INFO Train: [181/200][500/6787] eta 0:26:39 lr 0.000200 time 0.2579 (0.2544) loss 0.3643 (0.3587) grad_norm 38902.9453 (92803.9766) mem 14543MB +[2023-10-13 17:09:06 simmim_pretrain](main_simmim.py 218): INFO Train: [181/200][1000/6787] eta 0:24:28 lr 0.000200 time 0.2483 (0.2537) loss 0.3367 (0.3582) grad_norm 112823.5703 (94805.3281) mem 14543MB +[2023-10-13 17:11:13 simmim_pretrain](main_simmim.py 218): INFO Train: [181/200][1500/6787] eta 0:22:19 lr 0.000200 time 0.2459 (0.2534) loss 0.3346 (0.3579) grad_norm 69781.5000 (104531.6172) mem 14543MB +[2023-10-13 17:13:20 simmim_pretrain](main_simmim.py 218): INFO Train: [181/200][2000/6787] eta 0:20:13 lr 0.000200 time 0.2480 (0.2535) loss 0.3413 (0.3576) grad_norm 124624.3047 (114849.0859) mem 14543MB +[2023-10-13 17:15:26 simmim_pretrain](main_simmim.py 218): INFO Train: [181/200][2500/6787] eta 0:18:06 lr 0.000200 time 0.2531 (0.2534) loss 0.3699 (0.3575) grad_norm 99668.1641 (124477.9922) mem 14543MB +[2023-10-13 17:17:33 simmim_pretrain](main_simmim.py 218): INFO Train: [181/200][3000/6787] eta 0:15:59 lr 0.000200 time 0.2536 (0.2533) loss 0.3473 (0.3574) grad_norm 172290.0312 (132671.6562) mem 14543MB +[2023-10-13 17:19:39 simmim_pretrain](main_simmim.py 218): INFO Train: [181/200][3500/6787] eta 0:13:52 lr 0.000200 time 0.2490 (0.2532) loss 0.3615 (0.3573) grad_norm 206155.5312 (147419.6562) mem 14543MB +[2023-10-13 17:21:45 simmim_pretrain](main_simmim.py 218): INFO Train: [181/200][4000/6787] eta 0:11:45 lr 0.000200 time 0.2503 (0.2531) loss 0.3517 (0.3570) grad_norm 181384.9688 (162562.0469) mem 14543MB +[2023-10-13 17:23:51 simmim_pretrain](main_simmim.py 218): INFO Train: [181/200][4500/6787] eta 0:09:38 lr 0.000200 time 0.2487 (0.2530) loss 
0.3595 (0.3569) grad_norm 255865.3594 (185355.2969) mem 14543MB +[2023-10-13 17:25:58 simmim_pretrain](main_simmim.py 218): INFO Train: [181/200][5000/6787] eta 0:07:32 lr 0.000200 time 0.2494 (0.2530) loss 0.3735 (0.3568) grad_norm 183144.1250 (199658.5625) mem 14543MB +[2023-10-13 17:28:04 simmim_pretrain](main_simmim.py 218): INFO Train: [181/200][5500/6787] eta 0:05:25 lr 0.000200 time 0.2489 (0.2529) loss 0.3832 (0.3566) grad_norm 154694.3125 (inf) mem 14543MB +[2023-10-13 17:30:10 simmim_pretrain](main_simmim.py 218): INFO Train: [181/200][6000/6787] eta 0:03:18 lr 0.000200 time 0.2522 (0.2528) loss 0.3663 (0.3566) grad_norm 250510.2344 (inf) mem 14543MB +[2023-10-13 17:32:16 simmim_pretrain](main_simmim.py 218): INFO Train: [181/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2483 (0.2528) loss 0.3569 (0.3566) grad_norm 243042.2656 (inf) mem 14543MB +[2023-10-13 17:33:29 simmim_pretrain](main_simmim.py 228): INFO EPOCH 181 training takes 0:28:36 +[2023-10-13 17:33:30 simmim_pretrain](main_simmim.py 218): INFO Train: [182/200][0/6787] eta 2:48:47 lr 0.000200 time 1.4922 (1.4922) loss 0.3684 (0.3684) grad_norm 253017.4688 (253017.4688) mem 14543MB +[2023-10-13 17:35:35 simmim_pretrain](main_simmim.py 218): INFO Train: [182/200][500/6787] eta 0:26:29 lr 0.000200 time 0.2519 (0.2528) loss 0.3504 (0.3578) grad_norm 100166.4844 (inf) mem 14543MB +[2023-10-13 17:37:41 simmim_pretrain](main_simmim.py 218): INFO Train: [182/200][1000/6787] eta 0:24:17 lr 0.000200 time 0.2590 (0.2518) loss 0.3641 (0.3583) grad_norm 118435.4609 (inf) mem 14543MB +[2023-10-13 17:39:46 simmim_pretrain](main_simmim.py 218): INFO Train: [182/200][1500/6787] eta 0:22:10 lr 0.000200 time 0.2507 (0.2516) loss 0.3551 (0.3585) grad_norm 69517.2969 (inf) mem 14543MB +[2023-10-13 17:41:52 simmim_pretrain](main_simmim.py 218): INFO Train: [182/200][2000/6787] eta 0:20:04 lr 0.000200 time 0.2483 (0.2516) loss 0.3468 (0.3585) grad_norm 118994.6797 (inf) mem 14543MB +[2023-10-13 17:43:58 
simmim_pretrain](main_simmim.py 218): INFO Train: [182/200][2500/6787] eta 0:17:59 lr 0.000200 time 0.2577 (0.2517) loss 0.3287 (0.3585) grad_norm 81066.2109 (inf) mem 14543MB +[2023-10-13 17:46:04 simmim_pretrain](main_simmim.py 218): INFO Train: [182/200][3000/6787] eta 0:15:53 lr 0.000200 time 0.2530 (0.2518) loss 0.3565 (0.3583) grad_norm 201463.7031 (inf) mem 14543MB +[2023-10-13 17:48:11 simmim_pretrain](main_simmim.py 218): INFO Train: [182/200][3500/6787] eta 0:13:48 lr 0.000200 time 0.2602 (0.2519) loss 0.3482 (0.3581) grad_norm 181093.2031 (inf) mem 14543MB +[2023-10-13 17:50:17 simmim_pretrain](main_simmim.py 218): INFO Train: [182/200][4000/6787] eta 0:11:42 lr 0.000200 time 0.2586 (0.2521) loss 0.3625 (0.3579) grad_norm 151281.2969 (inf) mem 14543MB +[2023-10-13 17:52:24 simmim_pretrain](main_simmim.py 218): INFO Train: [182/200][4500/6787] eta 0:09:37 lr 0.000200 time 0.2606 (0.2523) loss 0.3597 (0.3577) grad_norm 141815.9062 (inf) mem 14543MB +[2023-10-13 17:54:34 simmim_pretrain](main_simmim.py 218): INFO Train: [182/200][5000/6787] eta 0:07:32 lr 0.000200 time 0.2610 (0.2531) loss 0.3373 (0.3579) grad_norm 139475.9688 (inf) mem 14543MB +[2023-10-13 17:56:44 simmim_pretrain](main_simmim.py 218): INFO Train: [182/200][5500/6787] eta 0:05:26 lr 0.000200 time 0.2606 (0.2538) loss 0.3451 (0.3580) grad_norm 129226.3750 (inf) mem 14543MB +[2023-10-13 17:58:55 simmim_pretrain](main_simmim.py 218): INFO Train: [182/200][6000/6787] eta 0:03:20 lr 0.000200 time 0.2608 (0.2543) loss 0.3433 (0.3582) grad_norm 115159.5625 (inf) mem 14543MB +[2023-10-13 18:01:05 simmim_pretrain](main_simmim.py 218): INFO Train: [182/200][6500/6787] eta 0:01:13 lr 0.000200 time 0.2609 (0.2548) loss 0.3533 (0.3582) grad_norm 80655.1641 (inf) mem 14543MB +[2023-10-13 18:02:20 simmim_pretrain](main_simmim.py 228): INFO EPOCH 182 training takes 0:28:51 +[2023-10-13 18:02:21 simmim_pretrain](main_simmim.py 218): INFO Train: [183/200][0/6787] eta 2:45:14 lr 0.000200 time 1.4609 (1.4609) 
loss 0.3515 (0.3515) grad_norm 76554.4844 (76554.4844) mem 14543MB +[2023-10-13 18:04:27 simmim_pretrain](main_simmim.py 218): INFO Train: [183/200][500/6787] eta 0:26:39 lr 0.000200 time 0.2476 (0.2544) loss 0.3591 (0.3564) grad_norm 127096.1562 (170891.0000) mem 14543MB +[2023-10-13 18:06:33 simmim_pretrain](main_simmim.py 218): INFO Train: [183/200][1000/6787] eta 0:24:26 lr 0.000200 time 0.2491 (0.2533) loss 0.3560 (0.3564) grad_norm 77557.0859 (165109.9219) mem 14543MB +[2023-10-13 18:08:39 simmim_pretrain](main_simmim.py 218): INFO Train: [183/200][1500/6787] eta 0:22:17 lr 0.000200 time 0.2463 (0.2529) loss 0.3723 (0.3568) grad_norm 413376.1875 (171186.3438) mem 14543MB +[2023-10-13 18:10:45 simmim_pretrain](main_simmim.py 218): INFO Train: [183/200][2000/6787] eta 0:20:09 lr 0.000200 time 0.2519 (0.2527) loss 0.3581 (0.3566) grad_norm 137128.2500 (190310.2969) mem 14543MB +[2023-10-13 18:12:52 simmim_pretrain](main_simmim.py 218): INFO Train: [183/200][2500/6787] eta 0:18:02 lr 0.000200 time 0.2532 (0.2526) loss 0.3703 (0.3567) grad_norm 317263.0625 (210609.0156) mem 14543MB +[2023-10-13 18:14:58 simmim_pretrain](main_simmim.py 218): INFO Train: [183/200][3000/6787] eta 0:15:56 lr 0.000200 time 0.2541 (0.2526) loss 0.3459 (0.3565) grad_norm 251966.2344 (227942.5625) mem 14543MB +[2023-10-13 18:17:04 simmim_pretrain](main_simmim.py 218): INFO Train: [183/200][3500/6787] eta 0:13:50 lr 0.000200 time 0.2498 (0.2526) loss 0.3682 (0.3564) grad_norm 275606.4375 (240739.4219) mem 14543MB +[2023-10-13 18:19:11 simmim_pretrain](main_simmim.py 218): INFO Train: [183/200][4000/6787] eta 0:11:44 lr 0.000200 time 0.2523 (0.2526) loss 0.3632 (0.3564) grad_norm 270686.3750 (inf) mem 14543MB +[2023-10-13 18:21:17 simmim_pretrain](main_simmim.py 218): INFO Train: [183/200][4500/6787] eta 0:09:37 lr 0.000200 time 0.2482 (0.2526) loss 0.3367 (0.3565) grad_norm 188962.1719 (inf) mem 14543MB +[2023-10-13 18:23:24 simmim_pretrain](main_simmim.py 218): INFO Train: 
[183/200][5000/6787] eta 0:07:31 lr 0.000200 time 0.2581 (0.2527) loss 0.3610 (0.3566) grad_norm 311665.9062 (inf) mem 14543MB +[2023-10-13 18:25:32 simmim_pretrain](main_simmim.py 218): INFO Train: [183/200][5500/6787] eta 0:05:25 lr 0.000200 time 0.2598 (0.2531) loss 0.3751 (0.3566) grad_norm 206069.2656 (inf) mem 14543MB +[2023-10-13 18:27:42 simmim_pretrain](main_simmim.py 218): INFO Train: [183/200][6000/6787] eta 0:03:19 lr 0.000200 time 0.2599 (0.2537) loss 0.3391 (0.3564) grad_norm 147205.3906 (inf) mem 14543MB +[2023-10-13 18:29:52 simmim_pretrain](main_simmim.py 218): INFO Train: [183/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2534 (0.2541) loss 0.3746 (0.3564) grad_norm 219439.5156 (inf) mem 14543MB +[2023-10-13 18:31:06 simmim_pretrain](main_simmim.py 228): INFO EPOCH 183 training takes 0:28:45 +[2023-10-13 18:31:07 simmim_pretrain](main_simmim.py 218): INFO Train: [184/200][0/6787] eta 2:57:50 lr 0.000200 time 1.5722 (1.5722) loss 0.3643 (0.3643) grad_norm 376927.5312 (376927.5312) mem 14543MB +[2023-10-13 18:33:15 simmim_pretrain](main_simmim.py 218): INFO Train: [184/200][500/6787] eta 0:26:57 lr 0.000200 time 0.2524 (0.2572) loss 0.3228 (0.3554) grad_norm 454106.4375 (338742.6875) mem 14543MB +[2023-10-13 18:35:22 simmim_pretrain](main_simmim.py 218): INFO Train: [184/200][1000/6787] eta 0:24:41 lr 0.000200 time 0.2519 (0.2560) loss 0.3319 (0.3552) grad_norm 493399.6562 (inf) mem 14543MB +[2023-10-13 18:37:29 simmim_pretrain](main_simmim.py 218): INFO Train: [184/200][1500/6787] eta 0:22:29 lr 0.000200 time 0.2501 (0.2552) loss 0.3598 (0.3552) grad_norm 493181.5312 (inf) mem 14543MB +[2023-10-13 18:39:35 simmim_pretrain](main_simmim.py 218): INFO Train: [184/200][2000/6787] eta 0:20:18 lr 0.000200 time 0.2550 (0.2546) loss 0.3548 (0.3553) grad_norm 278771.2500 (inf) mem 14543MB +[2023-10-13 18:41:41 simmim_pretrain](main_simmim.py 218): INFO Train: [184/200][2500/6787] eta 0:18:09 lr 0.000200 time 0.2502 (0.2542) loss 0.3281 (0.3553) grad_norm 
474966.7500 (inf) mem 14543MB +[2023-10-13 18:43:48 simmim_pretrain](main_simmim.py 218): INFO Train: [184/200][3000/6787] eta 0:16:01 lr 0.000200 time 0.2477 (0.2539) loss 0.3494 (0.3552) grad_norm 312082.7500 (inf) mem 14543MB +[2023-10-13 18:45:53 simmim_pretrain](main_simmim.py 218): INFO Train: [184/200][3500/6787] eta 0:13:53 lr 0.000200 time 0.2534 (0.2536) loss 0.3691 (0.3552) grad_norm 336733.9062 (inf) mem 14543MB +[2023-10-13 18:47:59 simmim_pretrain](main_simmim.py 218): INFO Train: [184/200][4000/6787] eta 0:11:45 lr 0.000200 time 0.2592 (0.2533) loss 0.3524 (0.3551) grad_norm 587151.1875 (inf) mem 14543MB +[2023-10-13 18:50:05 simmim_pretrain](main_simmim.py 218): INFO Train: [184/200][4500/6787] eta 0:09:38 lr 0.000200 time 0.2464 (0.2531) loss 0.3453 (0.3551) grad_norm 263619.8125 (inf) mem 14543MB +[2023-10-13 18:52:10 simmim_pretrain](main_simmim.py 218): INFO Train: [184/200][5000/6787] eta 0:07:31 lr 0.000200 time 0.2460 (0.2529) loss 0.3554 (0.3552) grad_norm 292374.7188 (inf) mem 14543MB +[2023-10-13 18:54:16 simmim_pretrain](main_simmim.py 218): INFO Train: [184/200][5500/6787] eta 0:05:25 lr 0.000200 time 0.2479 (0.2527) loss 0.3508 (0.3554) grad_norm 144429.1719 (inf) mem 14543MB +[2023-10-13 18:56:21 simmim_pretrain](main_simmim.py 218): INFO Train: [184/200][6000/6787] eta 0:03:18 lr 0.000200 time 0.2488 (0.2526) loss 0.3663 (0.3555) grad_norm 207911.6562 (inf) mem 14543MB +[2023-10-13 18:58:27 simmim_pretrain](main_simmim.py 218): INFO Train: [184/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2539 (0.2524) loss 0.3455 (0.3557) grad_norm 156165.5156 (inf) mem 14543MB +[2023-10-13 18:59:39 simmim_pretrain](main_simmim.py 228): INFO EPOCH 184 training takes 0:28:33 +[2023-10-13 18:59:41 simmim_pretrain](main_simmim.py 218): INFO Train: [185/200][0/6787] eta 2:50:30 lr 0.000200 time 1.5074 (1.5074) loss 0.3373 (0.3373) grad_norm 234501.8906 (234501.8906) mem 14543MB +[2023-10-13 19:01:47 simmim_pretrain](main_simmim.py 218): INFO Train: 
[185/200][500/6787] eta 0:26:35 lr 0.000200 time 0.2505 (0.2538) loss 0.3905 (0.3568) grad_norm 157650.0312 (inf) mem 14543MB +[2023-10-13 19:03:52 simmim_pretrain](main_simmim.py 218): INFO Train: [185/200][1000/6787] eta 0:24:22 lr 0.000200 time 0.2461 (0.2528) loss 0.3562 (0.3572) grad_norm 246554.1094 (inf) mem 14543MB +[2023-10-13 19:05:59 simmim_pretrain](main_simmim.py 218): INFO Train: [185/200][1500/6787] eta 0:22:15 lr 0.000200 time 0.2580 (0.2526) loss 0.3528 (0.3569) grad_norm 215366.1875 (inf) mem 14543MB +[2023-10-13 19:08:05 simmim_pretrain](main_simmim.py 218): INFO Train: [185/200][2000/6787] eta 0:20:09 lr 0.000200 time 0.2562 (0.2528) loss 0.3562 (0.3570) grad_norm 181876.3438 (inf) mem 14543MB +[2023-10-13 19:10:12 simmim_pretrain](main_simmim.py 218): INFO Train: [185/200][2500/6787] eta 0:18:03 lr 0.000200 time 0.2589 (0.2528) loss 0.3453 (0.3568) grad_norm 196953.1094 (inf) mem 14543MB +[2023-10-13 19:12:18 simmim_pretrain](main_simmim.py 218): INFO Train: [185/200][3000/6787] eta 0:15:57 lr 0.000200 time 0.2526 (0.2529) loss 0.3461 (0.3567) grad_norm 443918.4375 (inf) mem 14543MB +[2023-10-13 19:14:25 simmim_pretrain](main_simmim.py 218): INFO Train: [185/200][3500/6787] eta 0:13:51 lr 0.000200 time 0.2593 (0.2530) loss 0.3527 (0.3565) grad_norm 255741.5625 (inf) mem 14543MB +[2023-10-13 19:16:32 simmim_pretrain](main_simmim.py 218): INFO Train: [185/200][4000/6787] eta 0:11:45 lr 0.000200 time 0.2513 (0.2531) loss 0.3608 (0.3563) grad_norm 692436.2500 (inf) mem 14543MB +[2023-10-13 19:18:39 simmim_pretrain](main_simmim.py 218): INFO Train: [185/200][4500/6787] eta 0:09:38 lr 0.000200 time 0.2871 (0.2531) loss 0.3488 (0.3561) grad_norm 239431.0156 (inf) mem 14543MB +[2023-10-13 19:20:45 simmim_pretrain](main_simmim.py 218): INFO Train: [185/200][5000/6787] eta 0:07:32 lr 0.000200 time 0.2526 (0.2531) loss 0.3497 (0.3562) grad_norm 250409.8594 (inf) mem 14543MB +[2023-10-13 19:22:52 simmim_pretrain](main_simmim.py 218): INFO Train: 
[185/200][5500/6787] eta 0:05:25 lr 0.000200 time 0.2541 (0.2531) loss 0.3545 (0.3563) grad_norm 134846.6094 (inf) mem 14543MB +[2023-10-13 19:24:58 simmim_pretrain](main_simmim.py 218): INFO Train: [185/200][6000/6787] eta 0:03:19 lr 0.000200 time 0.2459 (0.2531) loss 0.3626 (0.3566) grad_norm 151372.5625 (inf) mem 14543MB +[2023-10-13 19:27:04 simmim_pretrain](main_simmim.py 218): INFO Train: [185/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2570 (0.2530) loss 0.3617 (0.3568) grad_norm 142897.0625 (inf) mem 14543MB +[2023-10-13 19:28:17 simmim_pretrain](main_simmim.py 228): INFO EPOCH 185 training takes 0:28:37 +[2023-10-13 19:28:19 simmim_pretrain](main_simmim.py 218): INFO Train: [186/200][0/6787] eta 2:55:48 lr 0.000200 time 1.5542 (1.5542) loss 0.3465 (0.3465) grad_norm 69697.7031 (69697.7031) mem 14543MB +[2023-10-13 19:30:24 simmim_pretrain](main_simmim.py 218): INFO Train: [186/200][500/6787] eta 0:26:35 lr 0.000200 time 0.2526 (0.2538) loss 0.3424 (0.3585) grad_norm 106456.5469 (116079.1406) mem 14543MB +[2023-10-13 19:32:30 simmim_pretrain](main_simmim.py 218): INFO Train: [186/200][1000/6787] eta 0:24:23 lr 0.000200 time 0.2537 (0.2528) loss 0.3512 (0.3580) grad_norm 148266.9375 (125810.9453) mem 14543MB +[2023-10-13 19:34:36 simmim_pretrain](main_simmim.py 218): INFO Train: [186/200][1500/6787] eta 0:22:13 lr 0.000200 time 0.2460 (0.2522) loss 0.3681 (0.3579) grad_norm 110984.7188 (138786.0625) mem 14543MB +[2023-10-13 19:36:41 simmim_pretrain](main_simmim.py 218): INFO Train: [186/200][2000/6787] eta 0:20:05 lr 0.000200 time 0.2497 (0.2519) loss 0.3559 (0.3577) grad_norm 202212.3438 (150280.7344) mem 14543MB +[2023-10-13 19:38:47 simmim_pretrain](main_simmim.py 218): INFO Train: [186/200][2500/6787] eta 0:17:59 lr 0.000200 time 0.2515 (0.2517) loss 0.3477 (0.3575) grad_norm 192295.7031 (163308.4375) mem 14543MB +[2023-10-13 19:40:52 simmim_pretrain](main_simmim.py 218): INFO Train: [186/200][3000/6787] eta 0:15:52 lr 0.000200 time 0.2458 (0.2516) 
loss 0.3780 (0.3572) grad_norm 329073.6562 (180958.5625) mem 14543MB +[2023-10-13 19:42:58 simmim_pretrain](main_simmim.py 218): INFO Train: [186/200][3500/6787] eta 0:13:47 lr 0.000200 time 0.2498 (0.2516) loss 0.3551 (0.3570) grad_norm 162216.1719 (199337.0156) mem 14543MB +[2023-10-13 19:45:04 simmim_pretrain](main_simmim.py 218): INFO Train: [186/200][4000/6787] eta 0:11:41 lr 0.000200 time 0.2529 (0.2516) loss 0.3431 (0.3571) grad_norm 204631.0938 (inf) mem 14543MB +[2023-10-13 19:47:10 simmim_pretrain](main_simmim.py 218): INFO Train: [186/200][4500/6787] eta 0:09:35 lr 0.000200 time 0.2521 (0.2517) loss 0.3492 (0.3577) grad_norm 167290.2344 (inf) mem 14543MB +[2023-10-13 19:49:16 simmim_pretrain](main_simmim.py 218): INFO Train: [186/200][5000/6787] eta 0:07:29 lr 0.000200 time 0.2511 (0.2518) loss 0.3563 (0.3581) grad_norm 64944.2695 (inf) mem 14543MB +[2023-10-13 19:51:23 simmim_pretrain](main_simmim.py 218): INFO Train: [186/200][5500/6787] eta 0:05:24 lr 0.000200 time 0.2513 (0.2519) loss 0.3580 (0.3585) grad_norm 44634.4023 (inf) mem 14543MB +[2023-10-13 19:53:29 simmim_pretrain](main_simmim.py 218): INFO Train: [186/200][6000/6787] eta 0:03:18 lr 0.000200 time 0.2527 (0.2520) loss 0.3448 (0.3588) grad_norm 55371.0078 (inf) mem 14543MB +[2023-10-13 19:55:38 simmim_pretrain](main_simmim.py 218): INFO Train: [186/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2582 (0.2524) loss 0.3413 (0.3588) grad_norm 63087.7148 (inf) mem 14543MB +[2023-10-13 19:56:52 simmim_pretrain](main_simmim.py 228): INFO EPOCH 186 training takes 0:28:35 +[2023-10-13 19:56:54 simmim_pretrain](main_simmim.py 218): INFO Train: [187/200][0/6787] eta 2:46:52 lr 0.000200 time 1.4753 (1.4753) loss 0.3656 (0.3656) grad_norm 76636.9141 (76636.9141) mem 14543MB +[2023-10-13 19:59:01 simmim_pretrain](main_simmim.py 218): INFO Train: [187/200][500/6787] eta 0:26:49 lr 0.000200 time 0.2531 (0.2561) loss 0.3592 (0.3586) grad_norm 88129.7109 (82246.4219) mem 14543MB +[2023-10-13 20:01:08 
simmim_pretrain](main_simmim.py 218): INFO Train: [187/200][1000/6787] eta 0:24:40 lr 0.000200 time 0.2597 (0.2559) loss 0.3422 (0.3589) grad_norm 49909.4648 (87997.9141) mem 14543MB +[2023-10-13 20:03:18 simmim_pretrain](main_simmim.py 218): INFO Train: [187/200][1500/6787] eta 0:22:38 lr 0.000200 time 0.2594 (0.2570) loss 0.3563 (0.3585) grad_norm 94033.5625 (91824.0156) mem 14543MB +[2023-10-13 20:05:28 simmim_pretrain](main_simmim.py 218): INFO Train: [187/200][2000/6787] eta 0:20:33 lr 0.000200 time 0.2597 (0.2576) loss 0.3535 (0.3582) grad_norm 272624.5625 (105814.9453) mem 14543MB +[2023-10-13 20:07:37 simmim_pretrain](main_simmim.py 218): INFO Train: [187/200][2500/6787] eta 0:18:25 lr 0.000200 time 0.2598 (0.2579) loss 0.3467 (0.3580) grad_norm 93059.9844 (115807.8359) mem 14543MB +[2023-10-13 20:09:47 simmim_pretrain](main_simmim.py 218): INFO Train: [187/200][3000/6787] eta 0:16:17 lr 0.000200 time 0.2595 (0.2581) loss 0.3531 (0.3578) grad_norm 241058.3750 (124323.6406) mem 14543MB +[2023-10-13 20:11:56 simmim_pretrain](main_simmim.py 218): INFO Train: [187/200][3500/6787] eta 0:14:08 lr 0.000200 time 0.2598 (0.2583) loss 0.3767 (0.3577) grad_norm 165201.0469 (133047.0938) mem 14543MB +[2023-10-13 20:14:06 simmim_pretrain](main_simmim.py 218): INFO Train: [187/200][4000/6787] eta 0:12:00 lr 0.000200 time 0.2593 (0.2584) loss 0.3584 (0.3574) grad_norm 199413.0781 (156939.2969) mem 14543MB +[2023-10-13 20:16:15 simmim_pretrain](main_simmim.py 218): INFO Train: [187/200][4500/6787] eta 0:09:50 lr 0.000200 time 0.2562 (0.2584) loss 0.3548 (0.3572) grad_norm 185677.5312 (167241.2500) mem 14543MB +[2023-10-13 20:18:25 simmim_pretrain](main_simmim.py 218): INFO Train: [187/200][5000/6787] eta 0:07:41 lr 0.000200 time 0.2588 (0.2585) loss 0.3616 (0.3570) grad_norm 582537.6875 (183261.5938) mem 14543MB +[2023-10-13 20:20:34 simmim_pretrain](main_simmim.py 218): INFO Train: [187/200][5500/6787] eta 0:05:32 lr 0.000200 time 0.2594 (0.2585) loss 0.3577 (0.3568) 
grad_norm 305145.8125 (inf) mem 14543MB +[2023-10-13 20:22:43 simmim_pretrain](main_simmim.py 218): INFO Train: [187/200][6000/6787] eta 0:03:23 lr 0.000200 time 0.2545 (0.2585) loss 0.3639 (0.3567) grad_norm 464761.6875 (inf) mem 14543MB +[2023-10-13 20:24:53 simmim_pretrain](main_simmim.py 218): INFO Train: [187/200][6500/6787] eta 0:01:14 lr 0.000200 time 0.2593 (0.2585) loss 0.3832 (0.3566) grad_norm 529095.9375 (inf) mem 14543MB +[2023-10-13 20:26:07 simmim_pretrain](main_simmim.py 228): INFO EPOCH 187 training takes 0:29:14 +[2023-10-13 20:26:09 simmim_pretrain](main_simmim.py 218): INFO Train: [188/200][0/6787] eta 2:54:05 lr 0.000200 time 1.5391 (1.5391) loss 0.3391 (0.3391) grad_norm 273062.9375 (273062.9375) mem 14543MB +[2023-10-13 20:28:14 simmim_pretrain](main_simmim.py 218): INFO Train: [188/200][500/6787] eta 0:26:28 lr 0.000200 time 0.2464 (0.2527) loss 0.3780 (0.3552) grad_norm 419497.4688 (370591.6875) mem 14543MB +[2023-10-13 20:30:19 simmim_pretrain](main_simmim.py 218): INFO Train: [188/200][1000/6787] eta 0:24:17 lr 0.000200 time 0.2523 (0.2519) loss 0.3454 (0.3552) grad_norm 464307.2188 (inf) mem 14543MB +[2023-10-13 20:32:25 simmim_pretrain](main_simmim.py 218): INFO Train: [188/200][1500/6787] eta 0:22:10 lr 0.000200 time 0.2525 (0.2517) loss 0.3534 (0.3554) grad_norm 277651.9062 (inf) mem 14543MB +[2023-10-13 20:34:31 simmim_pretrain](main_simmim.py 218): INFO Train: [188/200][2000/6787] eta 0:20:04 lr 0.000200 time 0.2511 (0.2515) loss 0.3667 (0.3554) grad_norm 487617.0000 (inf) mem 14543MB +[2023-10-13 20:36:36 simmim_pretrain](main_simmim.py 218): INFO Train: [188/200][2500/6787] eta 0:17:58 lr 0.000200 time 0.2559 (0.2515) loss 0.3490 (0.3554) grad_norm 323333.7500 (inf) mem 14543MB +[2023-10-13 20:38:42 simmim_pretrain](main_simmim.py 218): INFO Train: [188/200][3000/6787] eta 0:15:52 lr 0.000200 time 0.2493 (0.2516) loss 0.3668 (0.3551) grad_norm 201557.6719 (inf) mem 14543MB +[2023-10-13 20:40:48 simmim_pretrain](main_simmim.py 
218): INFO Train: [188/200][3500/6787] eta 0:13:47 lr 0.000200 time 0.2510 (0.2516) loss 0.3528 (0.3555) grad_norm 288090.2500 (inf) mem 14543MB +[2023-10-13 20:42:55 simmim_pretrain](main_simmim.py 218): INFO Train: [188/200][4000/6787] eta 0:11:41 lr 0.000200 time 0.2502 (0.2518) loss 0.3497 (0.3556) grad_norm 362685.1250 (inf) mem 14543MB +[2023-10-13 20:45:01 simmim_pretrain](main_simmim.py 218): INFO Train: [188/200][4500/6787] eta 0:09:35 lr 0.000200 time 0.2475 (0.2519) loss 0.3604 (0.3561) grad_norm 236826.0781 (inf) mem 14543MB +[2023-10-13 20:47:07 simmim_pretrain](main_simmim.py 218): INFO Train: [188/200][5000/6787] eta 0:07:30 lr 0.000200 time 0.2497 (0.2520) loss 0.3697 (0.3564) grad_norm 159493.6719 (inf) mem 14543MB +[2023-10-13 20:49:14 simmim_pretrain](main_simmim.py 218): INFO Train: [188/200][5500/6787] eta 0:05:24 lr 0.000200 time 0.2567 (0.2521) loss 0.3813 (0.3566) grad_norm 121574.4219 (inf) mem 14543MB +[2023-10-13 20:51:24 simmim_pretrain](main_simmim.py 218): INFO Train: [188/200][6000/6787] eta 0:03:18 lr 0.000200 time 0.2604 (0.2528) loss 0.3527 (0.3568) grad_norm 137406.7500 (inf) mem 14543MB +[2023-10-13 20:53:34 simmim_pretrain](main_simmim.py 218): INFO Train: [188/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2609 (0.2533) loss 0.3362 (0.3569) grad_norm 158516.0938 (inf) mem 14543MB +[2023-10-13 20:54:49 simmim_pretrain](main_simmim.py 228): INFO EPOCH 188 training takes 0:28:42 +[2023-10-13 20:54:51 simmim_pretrain](main_simmim.py 218): INFO Train: [189/200][0/6787] eta 2:55:21 lr 0.000200 time 1.5502 (1.5502) loss 0.3753 (0.3753) grad_norm 213029.8750 (213029.8750) mem 14543MB +[2023-10-13 20:56:57 simmim_pretrain](main_simmim.py 218): INFO Train: [189/200][500/6787] eta 0:26:40 lr 0.000200 time 0.2498 (0.2545) loss 0.3873 (0.3582) grad_norm 196106.9375 (164673.0625) mem 14543MB +[2023-10-13 20:59:03 simmim_pretrain](main_simmim.py 218): INFO Train: [189/200][1000/6787] eta 0:24:26 lr 0.000200 time 0.2503 (0.2535) loss 0.3613 
(0.3573) grad_norm 101362.5469 (167330.4062) mem 14543MB +[2023-10-13 21:01:09 simmim_pretrain](main_simmim.py 218): INFO Train: [189/200][1500/6787] eta 0:22:18 lr 0.000200 time 0.2533 (0.2531) loss 0.3648 (0.3570) grad_norm 204668.3750 (182612.4219) mem 14543MB +[2023-10-13 21:03:15 simmim_pretrain](main_simmim.py 218): INFO Train: [189/200][2000/6787] eta 0:20:10 lr 0.000200 time 0.2559 (0.2529) loss 0.3515 (0.3564) grad_norm 213556.3750 (220901.6406) mem 14543MB +[2023-10-13 21:05:21 simmim_pretrain](main_simmim.py 218): INFO Train: [189/200][2500/6787] eta 0:18:03 lr 0.000200 time 0.2458 (0.2527) loss 0.3596 (0.3565) grad_norm 338358.3125 (237615.6719) mem 14543MB +[2023-10-13 21:07:27 simmim_pretrain](main_simmim.py 218): INFO Train: [189/200][3000/6787] eta 0:15:56 lr 0.000200 time 0.2518 (0.2526) loss 0.3719 (0.3564) grad_norm 306506.5000 (253917.4375) mem 14543MB +[2023-10-13 21:09:33 simmim_pretrain](main_simmim.py 218): INFO Train: [189/200][3500/6787] eta 0:13:49 lr 0.000200 time 0.2520 (0.2524) loss 0.3631 (0.3562) grad_norm 173564.6406 (inf) mem 14543MB +[2023-10-13 21:11:39 simmim_pretrain](main_simmim.py 218): INFO Train: [189/200][4000/6787] eta 0:11:43 lr 0.000200 time 0.2589 (0.2523) loss 0.3540 (0.3560) grad_norm 248149.7188 (inf) mem 14543MB +[2023-10-13 21:13:45 simmim_pretrain](main_simmim.py 218): INFO Train: [189/200][4500/6787] eta 0:09:36 lr 0.000200 time 0.2499 (0.2522) loss 0.3646 (0.3561) grad_norm 147302.4688 (inf) mem 14543MB +[2023-10-13 21:15:50 simmim_pretrain](main_simmim.py 218): INFO Train: [189/200][5000/6787] eta 0:07:30 lr 0.000200 time 0.2498 (0.2521) loss 0.3300 (0.3562) grad_norm 201229.4688 (inf) mem 14543MB +[2023-10-13 21:17:56 simmim_pretrain](main_simmim.py 218): INFO Train: [189/200][5500/6787] eta 0:05:24 lr 0.000200 time 0.2473 (0.2520) loss 0.3655 (0.3562) grad_norm 155365.4062 (inf) mem 14543MB +[2023-10-13 21:20:02 simmim_pretrain](main_simmim.py 218): INFO Train: [189/200][6000/6787] eta 0:03:18 lr 0.000200 
time 0.2486 (0.2520) loss 0.3461 (0.3562) grad_norm 219934.7344 (inf) mem 14543MB +[2023-10-13 21:22:08 simmim_pretrain](main_simmim.py 218): INFO Train: [189/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2486 (0.2520) loss 0.3544 (0.3562) grad_norm 340285.9375 (inf) mem 14543MB +[2023-10-13 21:23:21 simmim_pretrain](main_simmim.py 228): INFO EPOCH 189 training takes 0:28:31 +[2023-10-13 21:23:22 simmim_pretrain](main_simmim.py 218): INFO Train: [190/200][0/6787] eta 2:47:47 lr 0.000200 time 1.4833 (1.4833) loss 0.3549 (0.3549) grad_norm 461040.5938 (461040.5938) mem 14543MB +[2023-10-13 21:25:28 simmim_pretrain](main_simmim.py 218): INFO Train: [190/200][500/6787] eta 0:26:40 lr 0.000200 time 0.2508 (0.2545) loss 0.3486 (0.3555) grad_norm 215064.6562 (430273.7812) mem 14543MB +[2023-10-13 21:27:35 simmim_pretrain](main_simmim.py 218): INFO Train: [190/200][1000/6787] eta 0:24:30 lr 0.000200 time 0.2550 (0.2541) loss 0.3626 (0.3554) grad_norm 332880.7812 (382086.2812) mem 14543MB +[2023-10-13 21:29:43 simmim_pretrain](main_simmim.py 218): INFO Train: [190/200][1500/6787] eta 0:22:27 lr 0.000200 time 0.2614 (0.2548) loss 0.3505 (0.3559) grad_norm 192371.3438 (inf) mem 14543MB +[2023-10-13 21:31:53 simmim_pretrain](main_simmim.py 218): INFO Train: [190/200][2000/6787] eta 0:20:26 lr 0.000200 time 0.2606 (0.2561) loss 0.3514 (0.3561) grad_norm 161280.6250 (inf) mem 14543MB +[2023-10-13 21:34:03 simmim_pretrain](main_simmim.py 218): INFO Train: [190/200][2500/6787] eta 0:18:21 lr 0.000200 time 0.2607 (0.2570) loss 0.3565 (0.3562) grad_norm 227372.6562 (inf) mem 14543MB +[2023-10-13 21:36:13 simmim_pretrain](main_simmim.py 218): INFO Train: [190/200][3000/6787] eta 0:16:15 lr 0.000200 time 0.2610 (0.2575) loss 0.3653 (0.3564) grad_norm 254291.0469 (inf) mem 14543MB +[2023-10-13 21:38:23 simmim_pretrain](main_simmim.py 218): INFO Train: [190/200][3500/6787] eta 0:14:06 lr 0.000200 time 0.2578 (0.2576) loss 0.3707 (0.3563) grad_norm 338453.4062 (inf) mem 14543MB 
+[2023-10-13 21:40:31 simmim_pretrain](main_simmim.py 218): INFO Train: [190/200][4000/6787] eta 0:11:57 lr 0.000200 time 0.2583 (0.2576) loss 0.3496 (0.3561) grad_norm 501778.2188 (inf) mem 14543MB +[2023-10-13 21:42:39 simmim_pretrain](main_simmim.py 218): INFO Train: [190/200][4500/6787] eta 0:09:48 lr 0.000200 time 0.2537 (0.2575) loss 0.3437 (0.3560) grad_norm 259197.0156 (inf) mem 14543MB +[2023-10-13 21:44:47 simmim_pretrain](main_simmim.py 218): INFO Train: [190/200][5000/6787] eta 0:07:39 lr 0.000200 time 0.2534 (0.2572) loss 0.3572 (0.3561) grad_norm 269465.2188 (inf) mem 14543MB +[2023-10-13 21:46:54 simmim_pretrain](main_simmim.py 218): INFO Train: [190/200][5500/6787] eta 0:05:30 lr 0.000200 time 0.2550 (0.2568) loss 0.3593 (0.3563) grad_norm 215768.2656 (inf) mem 14543MB +[2023-10-13 21:49:02 simmim_pretrain](main_simmim.py 218): INFO Train: [190/200][6000/6787] eta 0:03:22 lr 0.000200 time 0.2560 (0.2568) loss 0.3516 (0.3563) grad_norm 355818.0000 (inf) mem 14543MB +[2023-10-13 21:51:11 simmim_pretrain](main_simmim.py 218): INFO Train: [190/200][6500/6787] eta 0:01:13 lr 0.000200 time 0.2606 (0.2570) loss 0.3378 (0.3563) grad_norm 325573.3750 (inf) mem 14543MB +[2023-10-13 21:52:26 simmim_pretrain](main_simmim.py 228): INFO EPOCH 190 training takes 0:29:05 +[2023-10-13 21:52:28 simmim_pretrain](main_simmim.py 218): INFO Train: [191/200][0/6787] eta 2:49:50 lr 0.000200 time 1.5015 (1.5015) loss 0.3556 (0.3556) grad_norm 149541.4844 (149541.4844) mem 14543MB +[2023-10-13 21:54:34 simmim_pretrain](main_simmim.py 218): INFO Train: [191/200][500/6787] eta 0:26:38 lr 0.000200 time 0.2570 (0.2542) loss 0.3558 (0.3555) grad_norm 366313.5625 (309579.9375) mem 14543MB +[2023-10-13 21:56:40 simmim_pretrain](main_simmim.py 218): INFO Train: [191/200][1000/6787] eta 0:24:24 lr 0.000200 time 0.2556 (0.2531) loss 0.3511 (0.3556) grad_norm 473457.6875 (352328.5938) mem 14543MB +[2023-10-13 21:58:45 simmim_pretrain](main_simmim.py 218): INFO Train: 
[191/200][1500/6787] eta 0:22:15 lr 0.000200 time 0.2529 (0.2526) loss 0.3497 (0.3557) grad_norm 276720.4375 (360801.7500) mem 14543MB +[2023-10-13 22:00:51 simmim_pretrain](main_simmim.py 218): INFO Train: [191/200][2000/6787] eta 0:20:08 lr 0.000200 time 0.2532 (0.2524) loss 0.3606 (0.3557) grad_norm 346433.0312 (379431.5625) mem 14543MB +[2023-10-13 22:02:57 simmim_pretrain](main_simmim.py 218): INFO Train: [191/200][2500/6787] eta 0:18:00 lr 0.000200 time 0.2505 (0.2521) loss 0.3647 (0.3556) grad_norm 614310.3750 (inf) mem 14543MB +[2023-10-13 22:05:03 simmim_pretrain](main_simmim.py 218): INFO Train: [191/200][3000/6787] eta 0:15:54 lr 0.000200 time 0.2525 (0.2521) loss 0.3710 (0.3556) grad_norm 207754.6562 (inf) mem 14543MB +[2023-10-13 22:07:09 simmim_pretrain](main_simmim.py 218): INFO Train: [191/200][3500/6787] eta 0:13:48 lr 0.000200 time 0.2523 (0.2521) loss 0.3568 (0.3553) grad_norm 350684.7812 (inf) mem 14543MB +[2023-10-13 22:09:15 simmim_pretrain](main_simmim.py 218): INFO Train: [191/200][4000/6787] eta 0:11:42 lr 0.000200 time 0.2524 (0.2520) loss 0.3558 (0.3553) grad_norm 399651.0625 (inf) mem 14543MB +[2023-10-13 22:11:21 simmim_pretrain](main_simmim.py 218): INFO Train: [191/200][4500/6787] eta 0:09:36 lr 0.000200 time 0.2531 (0.2520) loss 0.3522 (0.3552) grad_norm 431535.2188 (inf) mem 14543MB +[2023-10-13 22:13:27 simmim_pretrain](main_simmim.py 218): INFO Train: [191/200][5000/6787] eta 0:07:30 lr 0.000200 time 0.2496 (0.2521) loss 0.3541 (0.3552) grad_norm 548871.8125 (inf) mem 14543MB +[2023-10-13 22:15:33 simmim_pretrain](main_simmim.py 218): INFO Train: [191/200][5500/6787] eta 0:05:24 lr 0.000200 time 0.2545 (0.2522) loss 0.3605 (0.3552) grad_norm 521067.8750 (inf) mem 14543MB +[2023-10-13 22:17:40 simmim_pretrain](main_simmim.py 218): INFO Train: [191/200][6000/6787] eta 0:03:18 lr 0.000200 time 0.2591 (0.2523) loss 0.3404 (0.3551) grad_norm 401392.2812 (inf) mem 14543MB +[2023-10-13 22:19:49 simmim_pretrain](main_simmim.py 218): INFO 
Train: [191/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2488 (0.2527) loss 0.3548 (0.3552) grad_norm 521799.0625 (inf) mem 14543MB +[2023-10-13 22:21:03 simmim_pretrain](main_simmim.py 228): INFO EPOCH 191 training takes 0:28:36 +[2023-10-13 22:21:04 simmim_pretrain](main_simmim.py 218): INFO Train: [192/200][0/6787] eta 2:47:37 lr 0.000200 time 1.4819 (1.4819) loss 0.3454 (0.3454) grad_norm 344808.4062 (344808.4062) mem 14543MB +[2023-10-13 22:23:11 simmim_pretrain](main_simmim.py 218): INFO Train: [192/200][500/6787] eta 0:26:49 lr 0.000200 time 0.2547 (0.2559) loss 0.3241 (0.3555) grad_norm 376958.5000 (nan) mem 14543MB +[2023-10-13 22:25:18 simmim_pretrain](main_simmim.py 218): INFO Train: [192/200][1000/6787] eta 0:24:35 lr 0.000200 time 0.2554 (0.2550) loss 0.3508 (0.3562) grad_norm 382849.2500 (nan) mem 14543MB +[2023-10-13 22:27:26 simmim_pretrain](main_simmim.py 218): INFO Train: [192/200][1500/6787] eta 0:22:29 lr 0.000200 time 0.2555 (0.2553) loss 0.3584 (0.3565) grad_norm 196838.1719 (nan) mem 14543MB +[2023-10-13 22:29:34 simmim_pretrain](main_simmim.py 218): INFO Train: [192/200][2000/6787] eta 0:20:22 lr 0.000200 time 0.2591 (0.2555) loss 0.3198 (0.3567) grad_norm 351823.6875 (nan) mem 14543MB +[2023-10-13 22:31:42 simmim_pretrain](main_simmim.py 218): INFO Train: [192/200][2500/6787] eta 0:18:14 lr 0.000200 time 0.2484 (0.2554) loss 0.3607 (0.3567) grad_norm 386591.3125 (nan) mem 14543MB +[2023-10-13 22:33:48 simmim_pretrain](main_simmim.py 218): INFO Train: [192/200][3000/6787] eta 0:16:05 lr 0.000200 time 0.2525 (0.2549) loss 0.3527 (0.3570) grad_norm 348804.7812 (nan) mem 14543MB +[2023-10-13 22:35:55 simmim_pretrain](main_simmim.py 218): INFO Train: [192/200][3500/6787] eta 0:13:57 lr 0.000200 time 0.2525 (0.2548) loss 0.3692 (0.3570) grad_norm 431138.1875 (nan) mem 14543MB +[2023-10-13 22:38:02 simmim_pretrain](main_simmim.py 218): INFO Train: [192/200][4000/6787] eta 0:11:50 lr 0.000200 time 0.2527 (0.2548) loss 0.3463 (0.3574) grad_norm 
212839.6562 (nan) mem 14543MB +[2023-10-13 22:40:10 simmim_pretrain](main_simmim.py 218): INFO Train: [192/200][4500/6787] eta 0:09:42 lr 0.000200 time 0.2592 (0.2548) loss 0.3679 (0.3576) grad_norm 129881.5234 (nan) mem 14543MB +[2023-10-13 22:42:17 simmim_pretrain](main_simmim.py 218): INFO Train: [192/200][5000/6787] eta 0:07:35 lr 0.000200 time 0.2544 (0.2547) loss 0.3476 (0.3578) grad_norm 98268.8984 (nan) mem 14543MB +[2023-10-13 22:44:24 simmim_pretrain](main_simmim.py 218): INFO Train: [192/200][5500/6787] eta 0:05:27 lr 0.000200 time 0.2532 (0.2546) loss 0.3521 (0.3579) grad_norm 127697.8984 (nan) mem 14543MB +[2023-10-13 22:46:30 simmim_pretrain](main_simmim.py 218): INFO Train: [192/200][6000/6787] eta 0:03:20 lr 0.000200 time 0.2520 (0.2545) loss 0.3685 (0.3580) grad_norm 287933.2812 (nan) mem 14543MB +[2023-10-13 22:48:37 simmim_pretrain](main_simmim.py 218): INFO Train: [192/200][6500/6787] eta 0:01:13 lr 0.000200 time 0.2670 (0.2545) loss 0.3481 (0.3579) grad_norm 139591.9688 (nan) mem 14543MB +[2023-10-13 22:49:51 simmim_pretrain](main_simmim.py 228): INFO EPOCH 192 training takes 0:28:47 +[2023-10-13 22:49:52 simmim_pretrain](main_simmim.py 218): INFO Train: [193/200][0/6787] eta 2:59:08 lr 0.000200 time 1.5836 (1.5836) loss 0.3680 (0.3680) grad_norm 275102.9375 (275102.9375) mem 14543MB +[2023-10-13 22:51:58 simmim_pretrain](main_simmim.py 218): INFO Train: [193/200][500/6787] eta 0:26:32 lr 0.000200 time 0.2488 (0.2534) loss 0.3592 (0.3567) grad_norm 83196.3984 (220798.6875) mem 14543MB +[2023-10-13 22:54:03 simmim_pretrain](main_simmim.py 218): INFO Train: [193/200][1000/6787] eta 0:24:20 lr 0.000200 time 0.2491 (0.2523) loss 0.3747 (0.3567) grad_norm 225868.1094 (inf) mem 14543MB +[2023-10-13 22:56:09 simmim_pretrain](main_simmim.py 218): INFO Train: [193/200][1500/6787] eta 0:22:14 lr 0.000200 time 0.2587 (0.2523) loss 0.3597 (0.3566) grad_norm 255076.7656 (inf) mem 14543MB +[2023-10-13 22:58:16 simmim_pretrain](main_simmim.py 218): INFO 
Train: [193/200][2000/6787] eta 0:20:08 lr 0.000200 time 0.2533 (0.2524) loss 0.3727 (0.3567) grad_norm 349562.0625 (inf) mem 14543MB +[2023-10-13 23:00:22 simmim_pretrain](main_simmim.py 218): INFO Train: [193/200][2500/6787] eta 0:18:02 lr 0.000200 time 0.2521 (0.2525) loss 0.3688 (0.3566) grad_norm 237473.4375 (inf) mem 14543MB +[2023-10-13 23:02:29 simmim_pretrain](main_simmim.py 218): INFO Train: [193/200][3000/6787] eta 0:15:56 lr 0.000200 time 0.2535 (0.2526) loss 0.3460 (0.3567) grad_norm 256213.2969 (inf) mem 14543MB +[2023-10-13 23:04:35 simmim_pretrain](main_simmim.py 218): INFO Train: [193/200][3500/6787] eta 0:13:50 lr 0.000200 time 0.2523 (0.2526) loss 0.3469 (0.3566) grad_norm 325910.7500 (inf) mem 14543MB +[2023-10-13 23:06:42 simmim_pretrain](main_simmim.py 218): INFO Train: [193/200][4000/6787] eta 0:11:44 lr 0.000200 time 0.2497 (0.2528) loss 0.3539 (0.3566) grad_norm 326580.2500 (inf) mem 14543MB +[2023-10-13 23:08:49 simmim_pretrain](main_simmim.py 218): INFO Train: [193/200][4500/6787] eta 0:09:38 lr 0.000200 time 0.2595 (0.2529) loss 0.3620 (0.3567) grad_norm 511985.5938 (inf) mem 14543MB +[2023-10-13 23:10:56 simmim_pretrain](main_simmim.py 218): INFO Train: [193/200][5000/6787] eta 0:07:32 lr 0.000200 time 0.2581 (0.2530) loss 0.3690 (0.3566) grad_norm 230771.1406 (inf) mem 14543MB +[2023-10-13 23:13:06 simmim_pretrain](main_simmim.py 218): INFO Train: [193/200][5500/6787] eta 0:05:26 lr 0.000200 time 0.2574 (0.2536) loss 0.3772 (0.3566) grad_norm 206221.6406 (inf) mem 14543MB +[2023-10-13 23:15:16 simmim_pretrain](main_simmim.py 218): INFO Train: [193/200][6000/6787] eta 0:03:20 lr 0.000200 time 0.2594 (0.2542) loss 0.3603 (0.3566) grad_norm 444114.5625 (inf) mem 14543MB +[2023-10-13 23:17:26 simmim_pretrain](main_simmim.py 218): INFO Train: [193/200][6500/6787] eta 0:01:13 lr 0.000200 time 0.2514 (0.2546) loss 0.3502 (0.3566) grad_norm 208559.3594 (inf) mem 14543MB +[2023-10-13 23:18:41 simmim_pretrain](main_simmim.py 228): INFO EPOCH 193 
training takes 0:28:50 +[2023-10-13 23:18:43 simmim_pretrain](main_simmim.py 218): INFO Train: [194/200][0/6787] eta 2:55:35 lr 0.000200 time 1.5523 (1.5523) loss 0.3652 (0.3652) grad_norm 205349.2188 (205349.2188) mem 14543MB +[2023-10-13 23:20:48 simmim_pretrain](main_simmim.py 218): INFO Train: [194/200][500/6787] eta 0:26:37 lr 0.000200 time 0.2505 (0.2540) loss 0.3431 (0.3554) grad_norm 492198.5000 (405991.1250) mem 14543MB +[2023-10-13 23:22:54 simmim_pretrain](main_simmim.py 218): INFO Train: [194/200][1000/6787] eta 0:24:22 lr 0.000200 time 0.2566 (0.2528) loss 0.3482 (0.3545) grad_norm 363155.5625 (inf) mem 14543MB +[2023-10-13 23:25:00 simmim_pretrain](main_simmim.py 218): INFO Train: [194/200][1500/6787] eta 0:22:15 lr 0.000200 time 0.2500 (0.2526) loss 0.3323 (0.3549) grad_norm 292055.9375 (inf) mem 14543MB +[2023-10-13 23:27:06 simmim_pretrain](main_simmim.py 218): INFO Train: [194/200][2000/6787] eta 0:20:08 lr 0.000200 time 0.2534 (0.2525) loss 0.3583 (0.3561) grad_norm 162312.2188 (inf) mem 14543MB +[2023-10-13 23:29:12 simmim_pretrain](main_simmim.py 218): INFO Train: [194/200][2500/6787] eta 0:18:01 lr 0.000200 time 0.2545 (0.2524) loss 0.3259 (0.3568) grad_norm 81559.5859 (inf) mem 14543MB +[2023-10-13 23:31:18 simmim_pretrain](main_simmim.py 218): INFO Train: [194/200][3000/6787] eta 0:15:55 lr 0.000200 time 0.2588 (0.2523) loss 0.3535 (0.3572) grad_norm 146313.4688 (inf) mem 14543MB +[2023-10-13 23:33:24 simmim_pretrain](main_simmim.py 218): INFO Train: [194/200][3500/6787] eta 0:13:48 lr 0.000200 time 0.2511 (0.2522) loss 0.3351 (0.3575) grad_norm 96472.0469 (inf) mem 14543MB +[2023-10-13 23:35:30 simmim_pretrain](main_simmim.py 218): INFO Train: [194/200][4000/6787] eta 0:11:42 lr 0.000200 time 0.2548 (0.2521) loss 0.3509 (0.3578) grad_norm 187762.5469 (inf) mem 14543MB +[2023-10-13 23:37:36 simmim_pretrain](main_simmim.py 218): INFO Train: [194/200][4500/6787] eta 0:09:36 lr 0.000200 time 0.2474 (0.2521) loss 0.3610 (0.3576) grad_norm 
105905.7109 (inf) mem 14543MB +[2023-10-13 23:39:42 simmim_pretrain](main_simmim.py 218): INFO Train: [194/200][5000/6787] eta 0:07:30 lr 0.000200 time 0.2551 (0.2520) loss 0.3624 (0.3575) grad_norm 294943.5938 (inf) mem 14543MB +[2023-10-13 23:41:48 simmim_pretrain](main_simmim.py 218): INFO Train: [194/200][5500/6787] eta 0:05:24 lr 0.000200 time 0.2506 (0.2521) loss 0.3491 (0.3575) grad_norm 188075.0781 (inf) mem 14543MB +[2023-10-13 23:43:54 simmim_pretrain](main_simmim.py 218): INFO Train: [194/200][6000/6787] eta 0:03:18 lr 0.000200 time 0.2591 (0.2522) loss 0.3640 (0.3573) grad_norm 393169.8125 (inf) mem 14543MB +[2023-10-13 23:46:01 simmim_pretrain](main_simmim.py 218): INFO Train: [194/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2490 (0.2522) loss 0.3753 (0.3572) grad_norm 255751.1562 (inf) mem 14543MB +[2023-10-13 23:47:14 simmim_pretrain](main_simmim.py 228): INFO EPOCH 194 training takes 0:28:32 +[2023-10-13 23:47:15 simmim_pretrain](main_simmim.py 218): INFO Train: [195/200][0/6787] eta 2:56:02 lr 0.000200 time 1.5563 (1.5563) loss 0.3615 (0.3615) grad_norm 336096.2188 (336096.2188) mem 14543MB +[2023-10-13 23:49:22 simmim_pretrain](main_simmim.py 218): INFO Train: [195/200][500/6787] eta 0:26:43 lr 0.000200 time 0.2523 (0.2551) loss 0.3388 (0.3559) grad_norm 507365.3750 (378207.1875) mem 14543MB +[2023-10-13 23:51:28 simmim_pretrain](main_simmim.py 218): INFO Train: [195/200][1000/6787] eta 0:24:29 lr 0.000200 time 0.2502 (0.2539) loss 0.3607 (0.3554) grad_norm 544352.3750 (inf) mem 14543MB +[2023-10-13 23:53:36 simmim_pretrain](main_simmim.py 218): INFO Train: [195/200][1500/6787] eta 0:22:27 lr 0.000200 time 0.2593 (0.2548) loss 0.3520 (0.3558) grad_norm 189888.5469 (inf) mem 14543MB +[2023-10-13 23:55:46 simmim_pretrain](main_simmim.py 218): INFO Train: [195/200][2000/6787] eta 0:20:26 lr 0.000200 time 0.2595 (0.2562) loss 0.3638 (0.3560) grad_norm 131617.6719 (inf) mem 14543MB +[2023-10-13 23:57:56 simmim_pretrain](main_simmim.py 218): INFO 
Train: [195/200][2500/6787] eta 0:18:21 lr 0.000200 time 0.2497 (0.2569) loss 0.3335 (0.3562) grad_norm 218092.7656 (inf) mem 14543MB +[2023-10-14 00:00:03 simmim_pretrain](main_simmim.py 218): INFO Train: [195/200][3000/6787] eta 0:16:11 lr 0.000200 time 0.2558 (0.2565) loss 0.3648 (0.3563) grad_norm 384229.4375 (inf) mem 14543MB +[2023-10-14 00:02:13 simmim_pretrain](main_simmim.py 218): INFO Train: [195/200][3500/6787] eta 0:14:04 lr 0.000200 time 0.2517 (0.2568) loss 0.3614 (0.3562) grad_norm 301729.7500 (inf) mem 14543MB +[2023-10-14 00:04:23 simmim_pretrain](main_simmim.py 218): INFO Train: [195/200][4000/6787] eta 0:11:56 lr 0.000200 time 0.2609 (0.2572) loss 0.3640 (0.3561) grad_norm 209223.8438 (inf) mem 14543MB +[2023-10-14 00:06:33 simmim_pretrain](main_simmim.py 218): INFO Train: [195/200][4500/6787] eta 0:09:48 lr 0.000200 time 0.2608 (0.2575) loss 0.3540 (0.3561) grad_norm 596011.0625 (inf) mem 14543MB +[2023-10-14 00:08:43 simmim_pretrain](main_simmim.py 218): INFO Train: [195/200][5000/6787] eta 0:07:40 lr 0.000200 time 0.2611 (0.2578) loss 0.3551 (0.3560) grad_norm 545414.4375 (inf) mem 14543MB +[2023-10-14 00:10:53 simmim_pretrain](main_simmim.py 218): INFO Train: [195/200][5500/6787] eta 0:05:32 lr 0.000200 time 0.2615 (0.2580) loss 0.3380 (0.3559) grad_norm 416826.4062 (inf) mem 14543MB +[2023-10-14 00:13:03 simmim_pretrain](main_simmim.py 218): INFO Train: [195/200][6000/6787] eta 0:03:23 lr 0.000200 time 0.2591 (0.2581) loss 0.3781 (0.3558) grad_norm 364557.1875 (inf) mem 14543MB +[2023-10-14 00:15:13 simmim_pretrain](main_simmim.py 218): INFO Train: [195/200][6500/6787] eta 0:01:14 lr 0.000200 time 0.2579 (0.2583) loss 0.3529 (0.3559) grad_norm 206967.9531 (inf) mem 14543MB +[2023-10-14 00:16:28 simmim_pretrain](main_simmim.py 228): INFO EPOCH 195 training takes 0:29:14 +[2023-10-14 00:16:29 simmim_pretrain](main_simmim.py 218): INFO Train: [196/200][0/6787] eta 2:50:55 lr 0.000200 time 1.5110 (1.5110) loss 0.3571 (0.3571) grad_norm 
364913.9062 (364913.9062) mem 14543MB +[2023-10-14 00:18:35 simmim_pretrain](main_simmim.py 218): INFO Train: [196/200][500/6787] eta 0:26:27 lr 0.000200 time 0.2475 (0.2525) loss 0.3715 (0.3576) grad_norm 316566.9688 (248606.1875) mem 14543MB +[2023-10-14 00:20:40 simmim_pretrain](main_simmim.py 218): INFO Train: [196/200][1000/6787] eta 0:24:18 lr 0.000200 time 0.2529 (0.2520) loss 0.3543 (0.3572) grad_norm 158194.5000 (245907.6250) mem 14543MB +[2023-10-14 00:22:46 simmim_pretrain](main_simmim.py 218): INFO Train: [196/200][1500/6787] eta 0:22:12 lr 0.000200 time 0.2578 (0.2521) loss 0.3476 (0.3565) grad_norm 320833.1250 (268958.8438) mem 14543MB +[2023-10-14 00:24:54 simmim_pretrain](main_simmim.py 218): INFO Train: [196/200][2000/6787] eta 0:20:10 lr 0.000200 time 0.2567 (0.2529) loss 0.3673 (0.3561) grad_norm 323179.0938 (nan) mem 14543MB +[2023-10-14 00:27:02 simmim_pretrain](main_simmim.py 218): INFO Train: [196/200][2500/6787] eta 0:18:07 lr 0.000200 time 0.2552 (0.2536) loss 0.3812 (0.3567) grad_norm 241881.3281 (nan) mem 14543MB +[2023-10-14 00:29:11 simmim_pretrain](main_simmim.py 218): INFO Train: [196/200][3000/6787] eta 0:16:02 lr 0.000200 time 0.2585 (0.2542) loss 0.3568 (0.3573) grad_norm 111176.3828 (nan) mem 14543MB +[2023-10-14 00:31:20 simmim_pretrain](main_simmim.py 218): INFO Train: [196/200][3500/6787] eta 0:13:57 lr 0.000200 time 0.2568 (0.2547) loss 0.3646 (0.3576) grad_norm 116309.4922 (nan) mem 14543MB +[2023-10-14 00:33:29 simmim_pretrain](main_simmim.py 218): INFO Train: [196/200][4000/6787] eta 0:11:50 lr 0.000200 time 0.2616 (0.2551) loss 0.3513 (0.3579) grad_norm 99959.5625 (nan) mem 14543MB +[2023-10-14 00:35:37 simmim_pretrain](main_simmim.py 218): INFO Train: [196/200][4500/6787] eta 0:09:43 lr 0.000200 time 0.2496 (0.2553) loss 0.3656 (0.3581) grad_norm 207295.3438 (nan) mem 14543MB +[2023-10-14 00:37:46 simmim_pretrain](main_simmim.py 218): INFO Train: [196/200][5000/6787] eta 0:07:36 lr 0.000200 time 0.2599 (0.2555) loss 
0.3639 (0.3582) grad_norm 158706.2656 (nan) mem 14543MB +[2023-10-14 00:39:54 simmim_pretrain](main_simmim.py 218): INFO Train: [196/200][5500/6787] eta 0:05:28 lr 0.000200 time 0.2604 (0.2556) loss 0.3473 (0.3583) grad_norm 168193.1250 (nan) mem 14543MB +[2023-10-14 00:42:02 simmim_pretrain](main_simmim.py 218): INFO Train: [196/200][6000/6787] eta 0:03:21 lr 0.000200 time 0.2597 (0.2557) loss 0.3799 (0.3584) grad_norm 149597.2656 (nan) mem 14543MB +[2023-10-14 00:44:10 simmim_pretrain](main_simmim.py 218): INFO Train: [196/200][6500/6787] eta 0:01:13 lr 0.000200 time 0.2588 (0.2557) loss 0.3681 (0.3584) grad_norm 88202.3984 (nan) mem 14543MB +[2023-10-14 00:45:24 simmim_pretrain](main_simmim.py 228): INFO EPOCH 196 training takes 0:28:56 +[2023-10-14 00:45:25 simmim_pretrain](main_simmim.py 218): INFO Train: [197/200][0/6787] eta 2:38:22 lr 0.000200 time 1.4001 (1.4001) loss 0.3577 (0.3577) grad_norm 115283.4844 (115283.4844) mem 14543MB +[2023-10-14 00:47:32 simmim_pretrain](main_simmim.py 218): INFO Train: [197/200][500/6787] eta 0:26:41 lr 0.000200 time 0.2516 (0.2547) loss 0.3674 (0.3577) grad_norm 270207.6875 (156907.6094) mem 14543MB +[2023-10-14 00:49:38 simmim_pretrain](main_simmim.py 218): INFO Train: [197/200][1000/6787] eta 0:24:25 lr 0.000200 time 0.2525 (0.2533) loss 0.3573 (0.3577) grad_norm 345715.2500 (170754.7500) mem 14543MB +[2023-10-14 00:51:43 simmim_pretrain](main_simmim.py 218): INFO Train: [197/200][1500/6787] eta 0:22:14 lr 0.000200 time 0.2466 (0.2524) loss 0.3511 (0.3572) grad_norm 145732.6094 (180641.3438) mem 14543MB +[2023-10-14 00:53:48 simmim_pretrain](main_simmim.py 218): INFO Train: [197/200][2000/6787] eta 0:20:06 lr 0.000200 time 0.2483 (0.2520) loss 0.3650 (0.3571) grad_norm 194416.2344 (191247.9375) mem 14543MB +[2023-10-14 00:55:53 simmim_pretrain](main_simmim.py 218): INFO Train: [197/200][2500/6787] eta 0:17:58 lr 0.000200 time 0.2588 (0.2517) loss 0.3533 (0.3570) grad_norm 440575.4062 (216641.0312) mem 14543MB 
+[2023-10-14 00:57:59 simmim_pretrain](main_simmim.py 218): INFO Train: [197/200][3000/6787] eta 0:15:52 lr 0.000200 time 0.2589 (0.2515) loss 0.3496 (0.3568) grad_norm 375312.9688 (234518.9531) mem 14543MB +[2023-10-14 01:00:04 simmim_pretrain](main_simmim.py 218): INFO Train: [197/200][3500/6787] eta 0:13:45 lr 0.000200 time 0.2517 (0.2513) loss 0.3484 (0.3565) grad_norm 427771.4688 (252147.2969) mem 14543MB +[2023-10-14 01:02:09 simmim_pretrain](main_simmim.py 218): INFO Train: [197/200][4000/6787] eta 0:11:39 lr 0.000200 time 0.2457 (0.2511) loss 0.3504 (0.3564) grad_norm 629555.3750 (272102.0000) mem 14543MB +[2023-10-14 01:04:14 simmim_pretrain](main_simmim.py 218): INFO Train: [197/200][4500/6787] eta 0:09:34 lr 0.000200 time 0.2477 (0.2510) loss 0.3545 (0.3562) grad_norm 263213.4688 (inf) mem 14543MB +[2023-10-14 01:06:19 simmim_pretrain](main_simmim.py 218): INFO Train: [197/200][5000/6787] eta 0:07:28 lr 0.000200 time 0.2599 (0.2509) loss 0.3531 (0.3561) grad_norm 403320.0625 (inf) mem 14543MB +[2023-10-14 01:08:24 simmim_pretrain](main_simmim.py 218): INFO Train: [197/200][5500/6787] eta 0:05:22 lr 0.000200 time 0.2526 (0.2509) loss 0.3437 (0.3561) grad_norm 232945.0156 (inf) mem 14543MB +[2023-10-14 01:10:30 simmim_pretrain](main_simmim.py 218): INFO Train: [197/200][6000/6787] eta 0:03:17 lr 0.000200 time 0.2561 (0.2510) loss 0.3618 (0.3562) grad_norm 292770.2500 (inf) mem 14543MB +[2023-10-14 01:12:36 simmim_pretrain](main_simmim.py 218): INFO Train: [197/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2545 (0.2510) loss 0.3767 (0.3564) grad_norm 321448.1250 (inf) mem 14543MB +[2023-10-14 01:13:49 simmim_pretrain](main_simmim.py 228): INFO EPOCH 197 training takes 0:28:24 +[2023-10-14 01:13:50 simmim_pretrain](main_simmim.py 218): INFO Train: [198/200][0/6787] eta 2:44:17 lr 0.000200 time 1.4524 (1.4524) loss 0.3634 (0.3634) grad_norm 302900.5000 (302900.5000) mem 14543MB +[2023-10-14 01:15:56 simmim_pretrain](main_simmim.py 218): INFO Train: 
[198/200][500/6787] eta 0:26:39 lr 0.000200 time 0.2513 (0.2544) loss 0.3636 (0.3584) grad_norm 142350.5781 (inf) mem 14543MB +[2023-10-14 01:18:03 simmim_pretrain](main_simmim.py 218): INFO Train: [198/200][1000/6787] eta 0:24:27 lr 0.000200 time 0.2539 (0.2536) loss 0.3589 (0.3592) grad_norm 130047.0938 (inf) mem 14543MB +[2023-10-14 01:20:09 simmim_pretrain](main_simmim.py 218): INFO Train: [198/200][1500/6787] eta 0:22:20 lr 0.000200 time 0.2552 (0.2535) loss 0.3624 (0.3596) grad_norm 149347.1875 (inf) mem 14543MB +[2023-10-14 01:22:17 simmim_pretrain](main_simmim.py 218): INFO Train: [198/200][2000/6787] eta 0:20:15 lr 0.000200 time 0.2616 (0.2538) loss 0.3522 (0.3607) grad_norm 79951.1406 (inf) mem 14543MB +[2023-10-14 01:24:27 simmim_pretrain](main_simmim.py 218): INFO Train: [198/200][2500/6787] eta 0:18:13 lr 0.000200 time 0.2607 (0.2550) loss 0.3635 (0.3611) grad_norm 51467.3008 (inf) mem 14543MB +[2023-10-14 01:26:37 simmim_pretrain](main_simmim.py 218): INFO Train: [198/200][3000/6787] eta 0:16:09 lr 0.000200 time 0.2605 (0.2559) loss 0.3345 (0.3611) grad_norm 24979.0566 (inf) mem 14543MB +[2023-10-14 01:28:47 simmim_pretrain](main_simmim.py 218): INFO Train: [198/200][3500/6787] eta 0:14:02 lr 0.000200 time 0.2604 (0.2565) loss 0.3515 (0.3613) grad_norm 68689.3516 (inf) mem 14543MB +[2023-10-14 01:30:57 simmim_pretrain](main_simmim.py 218): INFO Train: [198/200][4000/6787] eta 0:11:55 lr 0.000200 time 0.2608 (0.2569) loss 0.3564 (0.3611) grad_norm 133474.7812 (inf) mem 14543MB +[2023-10-14 01:33:07 simmim_pretrain](main_simmim.py 218): INFO Train: [198/200][4500/6787] eta 0:09:48 lr 0.000200 time 0.2603 (0.2572) loss 0.3701 (0.3609) grad_norm 66461.5703 (inf) mem 14543MB +[2023-10-14 01:35:17 simmim_pretrain](main_simmim.py 218): INFO Train: [198/200][5000/6787] eta 0:07:40 lr 0.000200 time 0.2607 (0.2575) loss 0.3963 (0.3606) grad_norm 152168.8750 (inf) mem 14543MB +[2023-10-14 01:37:26 simmim_pretrain](main_simmim.py 218): INFO Train: 
[198/200][5500/6787] eta 0:05:31 lr 0.000200 time 0.2612 (0.2577) loss 0.3558 (0.3605) grad_norm 103547.5781 (inf) mem 14543MB +[2023-10-14 01:39:36 simmim_pretrain](main_simmim.py 218): INFO Train: [198/200][6000/6787] eta 0:03:22 lr 0.000200 time 0.2605 (0.2579) loss 0.3301 (0.3603) grad_norm 133191.2656 (inf) mem 14543MB +[2023-10-14 01:41:46 simmim_pretrain](main_simmim.py 218): INFO Train: [198/200][6500/6787] eta 0:01:14 lr 0.000200 time 0.2598 (0.2580) loss 0.3534 (0.3600) grad_norm 236418.2812 (inf) mem 14543MB +[2023-10-14 01:43:01 simmim_pretrain](main_simmim.py 228): INFO EPOCH 198 training takes 0:29:12 +[2023-10-14 01:43:03 simmim_pretrain](main_simmim.py 218): INFO Train: [199/200][0/6787] eta 3:05:32 lr 0.000200 time 1.6403 (1.6403) loss 0.3670 (0.3670) grad_norm 124683.3828 (124683.3828) mem 14543MB +[2023-10-14 01:45:08 simmim_pretrain](main_simmim.py 218): INFO Train: [199/200][500/6787] eta 0:26:26 lr 0.000200 time 0.2516 (0.2524) loss 0.3567 (0.3573) grad_norm 139540.4375 (181335.5781) mem 14543MB +[2023-10-14 01:47:13 simmim_pretrain](main_simmim.py 218): INFO Train: [199/200][1000/6787] eta 0:24:15 lr 0.000200 time 0.2451 (0.2514) loss 0.3620 (0.3570) grad_norm 190056.2344 (182440.2812) mem 14543MB +[2023-10-14 01:49:18 simmim_pretrain](main_simmim.py 218): INFO Train: [199/200][1500/6787] eta 0:22:07 lr 0.000200 time 0.2587 (0.2511) loss 0.3333 (0.3563) grad_norm 268888.2812 (202905.9062) mem 14543MB +[2023-10-14 01:51:24 simmim_pretrain](main_simmim.py 218): INFO Train: [199/200][2000/6787] eta 0:20:02 lr 0.000200 time 0.2496 (0.2511) loss 0.3661 (0.3561) grad_norm 511422.1250 (223033.3906) mem 14543MB +[2023-10-14 01:53:29 simmim_pretrain](main_simmim.py 218): INFO Train: [199/200][2500/6787] eta 0:17:56 lr 0.000200 time 0.2463 (0.2511) loss 0.3614 (0.3558) grad_norm 254368.5312 (243853.5938) mem 14543MB +[2023-10-14 01:55:35 simmim_pretrain](main_simmim.py 218): INFO Train: [199/200][3000/6787] eta 0:15:51 lr 0.000200 time 0.2571 (0.2512) 
loss 0.3416 (0.3557) grad_norm 422309.1250 (264830.0312) mem 14543MB +[2023-10-14 01:57:41 simmim_pretrain](main_simmim.py 218): INFO Train: [199/200][3500/6787] eta 0:13:46 lr 0.000200 time 0.2486 (0.2514) loss 0.3685 (0.3556) grad_norm 418466.9688 (inf) mem 14543MB +[2023-10-14 01:59:47 simmim_pretrain](main_simmim.py 218): INFO Train: [199/200][4000/6787] eta 0:11:40 lr 0.000200 time 0.2596 (0.2515) loss 0.3498 (0.3556) grad_norm 545823.3750 (inf) mem 14543MB +[2023-10-14 02:01:54 simmim_pretrain](main_simmim.py 218): INFO Train: [199/200][4500/6787] eta 0:09:35 lr 0.000200 time 0.2592 (0.2516) loss 0.3615 (0.3554) grad_norm 556156.5625 (inf) mem 14543MB +[2023-10-14 02:04:00 simmim_pretrain](main_simmim.py 218): INFO Train: [199/200][5000/6787] eta 0:07:29 lr 0.000200 time 0.2543 (0.2517) loss 0.3322 (0.3553) grad_norm 203762.3125 (nan) mem 14543MB +[2023-10-14 02:06:06 simmim_pretrain](main_simmim.py 218): INFO Train: [199/200][5500/6787] eta 0:05:24 lr 0.000200 time 0.2561 (0.2518) loss 0.3517 (0.3554) grad_norm 353202.0312 (nan) mem 14543MB +[2023-10-14 02:08:15 simmim_pretrain](main_simmim.py 218): INFO Train: [199/200][6000/6787] eta 0:03:18 lr 0.000200 time 0.2598 (0.2522) loss 0.3787 (0.3556) grad_norm 189523.1719 (nan) mem 14543MB +[2023-10-14 02:10:24 simmim_pretrain](main_simmim.py 218): INFO Train: [199/200][6500/6787] eta 0:01:12 lr 0.000200 time 0.2597 (0.2528) loss 0.3604 (0.3558) grad_norm 114084.1875 (nan) mem 14543MB +[2023-10-14 02:11:39 simmim_pretrain](main_simmim.py 228): INFO EPOCH 199 training takes 0:28:38 +[2023-10-14 02:11:39 simmim_pretrain](utils.py 62): INFO /root/autodl-tmp/LSQ-simmim/checkpoint/simmim_pretrain/vit_simmim/ckpt_epoch_199.pth saving...... +[2023-10-14 02:11:40 simmim_pretrain](utils.py 64): INFO /root/autodl-tmp/LSQ-simmim/checkpoint/simmim_pretrain/vit_simmim/ckpt_epoch_199.pth saved !!! 
def build_optimizer(config, model, logger, is_pretrain):
    """Build the optimizer for the pre-training or the fine-tuning stage.

    :param config: config node; TRAIN.OPTIMIZER.*, TRAIN.BASE_LR and TRAIN.WEIGHT_DECAY are read,
                   plus MODEL.* and TRAIN.LAYER_DECAY when fine-tuning
    :param model: module whose trainable parameters are optimized
    :param logger: logger used to report the parameter grouping
    :param is_pretrain: True for the pre-training optimizer, False for the fine-tuning one

    :return: the constructed torch optimizer
    """
    if is_pretrain:
        return build_pretrain_optimizer(config, model, logger)
    return build_finetune_optimizer(config, model, logger)


def _get_no_weight_decay_rules(model, logger):
    """Return (skip names, skip keywords) that the model excludes from weight decay."""
    skip = {}
    skip_keywords = {}
    if hasattr(model, 'no_weight_decay'):
        skip = model.no_weight_decay()
        logger.info(f'No weight decay: {skip}')
    if hasattr(model, 'no_weight_decay_keywords'):
        skip_keywords = model.no_weight_decay_keywords()
        logger.info(f'No weight decay keywords: {skip_keywords}')
    return skip, skip_keywords


def _create_optimizer(config, parameters, logger):
    """Instantiate the optimizer named by config.TRAIN.OPTIMIZER.NAME over the given param groups."""
    opt_lower = config.TRAIN.OPTIMIZER.NAME.lower()
    if opt_lower == 'sgd':
        optimizer = optim.SGD(parameters, momentum=config.TRAIN.OPTIMIZER.MOMENTUM, nesterov=True,
                              lr=config.TRAIN.BASE_LR, weight_decay=config.TRAIN.WEIGHT_DECAY)
    elif opt_lower == 'adamw':
        optimizer = optim.AdamW(parameters, eps=config.TRAIN.OPTIMIZER.EPS, betas=config.TRAIN.OPTIMIZER.BETAS,
                                lr=config.TRAIN.BASE_LR, weight_decay=config.TRAIN.WEIGHT_DECAY)
    else:
        # fail here with a clear message instead of handing None to the training loop
        raise NotImplementedError(f"Unsupported optimizer: {config.TRAIN.OPTIMIZER.NAME}")

    logger.info(optimizer)
    return optimizer


def build_pretrain_optimizer(config, model, logger):
    """Build the pre-training optimizer with two param groups (with / without weight decay).

    :raises NotImplementedError: if the configured optimizer name is neither 'sgd' nor 'adamw'
    """
    logger.info('>>>>>>>>>> Build Optimizer for Pre-training Stage')
    skip, skip_keywords = _get_no_weight_decay_rules(model, logger)
    parameters = get_pretrain_param_groups(model, logger, skip, skip_keywords)
    return _create_optimizer(config, parameters, logger)


def get_pretrain_param_groups(model, logger, skip_list=(), skip_keywords=()):
    """Split the trainable parameters into a decayed group and a non-decayed group.

    A parameter gets no weight decay when it is 1-D, is a bias, is named in skip_list,
    or its name contains one of skip_keywords.

    :return: [{'params': decayed}, {'params': not decayed, 'weight_decay': 0.}]
    """
    has_decay = []
    no_decay = []
    has_decay_name = []
    no_decay_name = []

    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if len(param.shape) == 1 or name.endswith(".bias") or (name in skip_list) or \
                check_keywords_in_name(name, skip_keywords):
            no_decay.append(param)
            no_decay_name.append(name)
        else:
            has_decay.append(param)
            has_decay_name.append(name)
    logger.info(f'No decay params: {no_decay_name}')
    logger.info(f'Has decay params: {has_decay_name}')
    return [{'params': has_decay},
            {'params': no_decay, 'weight_decay': 0.}]


def build_finetune_optimizer(config, model, logger):
    """Build the fine-tuning optimizer with layer-wise learning-rate decay.

    :raises NotImplementedError: if config.MODEL.TYPE is neither 'swin' nor 'vit',
                                 or the optimizer name is neither 'sgd' nor 'adamw'
    """
    logger.info('>>>>>>>>>> Build Optimizer for Fine-tuning Stage')
    if config.MODEL.TYPE == 'swin':
        depths = config.MODEL.SWIN.DEPTHS
        num_layers = sum(depths)
        get_layer_func = partial(get_swin_layer, num_layers=num_layers + 2, depths=depths)
    elif config.MODEL.TYPE == 'vit':
        num_layers = config.MODEL.VIT.DEPTH
        get_layer_func = partial(get_vit_layer, num_layers=num_layers + 2)
    else:
        raise NotImplementedError

    # scales[layer_id] = LAYER_DECAY ** (distance from the last layer id); the last id gets scale 1
    scales = [config.TRAIN.LAYER_DECAY ** i for i in reversed(range(num_layers + 2))]

    skip, skip_keywords = _get_no_weight_decay_rules(model, logger)

    parameters = get_finetune_param_groups(
        model, logger, config.TRAIN.BASE_LR, config.TRAIN.WEIGHT_DECAY,
        get_layer_func, scales, skip, skip_keywords)

    return _create_optimizer(config, parameters, logger)


def get_vit_layer(name, num_layers):
    """Map a ViT parameter name to its layer id (0 = embeddings, num_layers - 1 = everything after the blocks)."""
    if name in ("cls_token", "mask_token", "pos_embed"):
        return 0
    elif name.startswith("patch_embed"):
        return 0
    elif name.startswith("rel_pos_bias"):
        return num_layers - 1
    elif name.startswith("blocks"):
        layer_id = int(name.split('.')[1])
        return layer_id + 1
    else:
        return num_layers - 1


def get_swin_layer(name, num_layers, depths):
    """Map a Swin parameter name to its layer id (0 = embeddings, num_layers - 1 = everything after the stages)."""
    # exact match: the original `name in ("mask_token")` was a substring test on a plain string
    if name == "mask_token":
        return 0
    elif name.startswith("patch_embed"):
        return 0
    elif name.startswith("layers"):
        layer_id = int(name.split('.')[1])
        block_id = name.split('.')[3]
        if block_id == 'reduction' or block_id == 'norm':
            # downsample parameters are assigned the id of the last block of their stage
            return sum(depths[:layer_id + 1])
        layer_id = sum(depths[:layer_id]) + int(block_id)
        return layer_id + 1
    else:
        return num_layers - 1


def get_finetune_param_groups(model, logger, lr, weight_decay, get_layer_func, scales, skip_list=(), skip_keywords=()):
    """Group the trainable parameters by (layer id, decay / no decay) with a per-group learning rate.

    :param lr: base learning rate, multiplied by the group's scale
    :param weight_decay: weight decay applied to the "decay" groups
    :param get_layer_func: callable mapping a parameter name to a layer id, or None for no layer grouping
    :param scales: sequence indexed by layer id giving the lr scale, or None for scale 1

    :return: list of param-group dicts accepted by torch optimizers
    """
    parameter_group_names = {}
    parameter_group_vars = {}

    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if len(param.shape) == 1 or name.endswith(".bias") or (name in skip_list) or \
                check_keywords_in_name(name, skip_keywords):
            group_name = "no_decay"
            this_weight_decay = 0.
        else:
            group_name = "decay"
            this_weight_decay = weight_decay
        if get_layer_func is not None:
            layer_id = get_layer_func(name)
            group_name = "layer_%d_%s" % (layer_id, group_name)
        else:
            layer_id = None

        if group_name not in parameter_group_names:
            # scales is indexed by layer id, so it only applies when a layer id exists
            if scales is not None and layer_id is not None:
                scale = scales[layer_id]
            else:
                scale = 1.

            # names go to the log (JSON-serialisable), tensors go to the optimizer
            parameter_group_names[group_name] = {
                "group_name": group_name,
                "weight_decay": this_weight_decay,
                "params": [],
                "lr": lr * scale,
                "lr_scale": scale,
            }
            parameter_group_vars[group_name] = {
                "group_name": group_name,
                "weight_decay": this_weight_decay,
                "params": [],
                "lr": lr * scale,
                "lr_scale": scale
            }

        parameter_group_vars[group_name]["params"].append(param)
        parameter_group_names[group_name]["params"].append(name)
    logger.info("Param groups = %s" % json.dumps(parameter_group_names, indent=2))
    return list(parameter_group_vars.values())


def check_keywords_in_name(name, keywords=()):
    """Return True if any of the keywords is a substring of name."""
    return any(keyword in name for keyword in keywords)
+set -e + +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python -u -m torch.distributed.launch \ + --nproc_per_node 4 \ + main_simmim.py \ + --tag vit_simmim \ + --cfg ./configs/vit_base__test/simmim_pretrain__vit_base__img224__100ep.yaml \ + --batch-size 128 \ + --data-path $data_path \ + --output $checkpoint_path \ + --log_dir $tensorboard_path \ + --amp-opt-level O1 \ + --load-weight $model_weights + +python load_vit_from_ckpt.py \ + --checkpoint /root/autodl-tmp/LSQ-simmim/B/checkpoint/simmim_pretrain/vit_simmim/ckpt_epoch_199.pth \ + --save-to ./output/ \ + --save-name "ViT_b16_224_timm_SIMMIM_ALL_200.pth" \ + --basic-weight $model_weights \ + --num-classes 2 + +set +e \ No newline at end of file diff --git a/PuzzleTuning/Counterpart PreTrain Methods/simmim/requirements.txt b/PuzzleTuning/Counterpart PreTrain Methods/simmim/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..16ebf2cdf2af6b8ae1d6cf4ed81ace18978bfda3 --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/simmim/requirements.txt @@ -0,0 +1,5 @@ +pyyaml +scipy +termcolor +timm +yacs \ No newline at end of file diff --git a/PuzzleTuning/Counterpart PreTrain Methods/simmim/utils.py b/PuzzleTuning/Counterpart PreTrain Methods/simmim/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4b902fb75b720c78be6346553d1dbaffa4bf856d --- /dev/null +++ b/PuzzleTuning/Counterpart PreTrain Methods/simmim/utils.py @@ -0,0 +1,286 @@ +# -------------------------------------------------------- +# SimMIM +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ze Liu +# Modified by Zhenda Xie +# -------------------------------------------------------- + +import os +import torch +import torch.distributed as dist +import numpy as np +from scipy import interpolate + +# try: +# # noinspection PyUnresolvedReferences +# # from apex import amp +# import torch.cuda.amp as amp +# from torch.cuda.amp import autocast as autocast 
def load_checkpoint(config, model, optimizer, lr_scheduler, logger):
    """Resume model (and, unless evaluating, optimizer / scheduler / epoch) from config.MODEL.RESUME.

    :param config: config node; MODEL.RESUME, EVAL_MODE are read and TRAIN.START_EPOCH is updated
    :param model: module receiving checkpoint['model'] (loaded non-strictly)
    :param optimizer: optimizer restored from checkpoint['optimizer'] when present
    :param lr_scheduler: scheduler restored from checkpoint['lr_scheduler'] when present
    :param logger: logger for progress messages

    :return: checkpoint['max_accuracy'] if stored, else 0.0
    """
    logger.info(f">>>>>>>>>> Resuming from {config.MODEL.RESUME} ..........")
    if config.MODEL.RESUME.startswith('https'):
        checkpoint = torch.hub.load_state_dict_from_url(
            config.MODEL.RESUME, map_location='cpu', check_hash=True)
    else:
        # NOTE(review): checkpoints written by save_checkpoint also store the config object,
        # which may not be loadable under a weights-only torch.load default - confirm on your torch version
        checkpoint = torch.load(config.MODEL.RESUME, map_location='cpu')
    msg = model.load_state_dict(checkpoint['model'], strict=False)
    logger.info(msg)
    max_accuracy = 0.0
    if not config.EVAL_MODE and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        config.defrost()
        config.TRAIN.START_EPOCH = checkpoint['epoch'] + 1
        config.freeze()
        logger.info(f"=> loaded successfully '{config.MODEL.RESUME}' (epoch {checkpoint['epoch']})")
        if 'max_accuracy' in checkpoint:
            max_accuracy = checkpoint['max_accuracy']

    del checkpoint
    torch.cuda.empty_cache()
    return max_accuracy


def save_checkpoint(config, epoch, model, max_accuracy, optimizer, lr_scheduler, logger):
    """Write model / optimizer / scheduler state, max_accuracy, epoch and config to
    <config.OUTPUT>/ckpt_epoch_<epoch>.pth."""
    save_state = {'model': model.state_dict(),
                  'optimizer': optimizer.state_dict(),
                  'lr_scheduler': lr_scheduler.state_dict(),
                  'max_accuracy': max_accuracy,
                  'epoch': epoch,
                  'config': config}

    save_path = os.path.join(config.OUTPUT, f'ckpt_epoch_{epoch}.pth')
    logger.info(f"{save_path} saving......")
    torch.save(save_state, save_path)
    logger.info(f"{save_path} saved !!!")


def get_grad_norm(parameters, norm_type=2):
    """Return the total gradient norm over all parameters that have a gradient.

    :param parameters: a tensor or an iterable of tensors; entries without .grad are ignored
    :param norm_type: order of the norm; float('inf') gives the largest per-parameter max-norm

    :return: the norm as a Python number (0 when no parameter has a gradient)
    """
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    parameters = [p for p in parameters if p.grad is not None]
    norm_type = float(norm_type)

    if norm_type == float('inf'):
        # the generic formula below degenerates to x ** inf followed by ** 0 for the infinity norm
        return max((p.grad.detach().norm(norm_type).item() for p in parameters), default=0.0)

    total_norm = 0
    for p in parameters:
        param_norm = p.grad.detach().norm(norm_type)
        total_norm += param_norm.item() ** norm_type
    total_norm = total_norm ** (1. / norm_type)
    return total_norm


def auto_resume_helper(output_dir, logger):
    """Return the most recently modified checkpoint file (name ending in 'pth') in output_dir, or None."""
    checkpoints = os.listdir(output_dir)
    checkpoints = [ckpt for ckpt in checkpoints if ckpt.endswith('pth')]
    logger.info(f"All checkpoints founded in {output_dir}: {checkpoints}")
    if not checkpoints:
        return None

    latest_checkpoint = max([os.path.join(output_dir, d) for d in checkpoints], key=os.path.getmtime)
    logger.info(f"The latest checkpoint founded: {latest_checkpoint}")
    return latest_checkpoint


def reduce_tensor(tensor):
    """Return a copy of tensor averaged over all processes of the default process group."""
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= dist.get_world_size()
    return rt
def load_pretrained(config, model, logger):
    """Load the pre-trained weights at config.PRETRAINED into model for fine-tuning.

    Keys carrying an 'encoder.' prefix are stripped of it (all other keys are then dropped),
    relative-position tables are remapped to the model's geometry, and the result is loaded
    non-strictly.

    :raises NotImplementedError: if config.MODEL.TYPE is neither 'swin' nor 'vit'
    """
    logger.info(f">>>>>>>>>> Fine-tuned from {config.PRETRAINED} ..........")
    checkpoint = torch.load(config.PRETRAINED, map_location='cpu')
    checkpoint_model = checkpoint['model']

    if any('encoder.' in k for k in checkpoint_model.keys()):
        checkpoint_model = {k.replace('encoder.', ''): v for k, v in checkpoint_model.items() if k.startswith('encoder.')}
        logger.info('Detect pre-trained model, remove [encoder.] prefix.')
    else:
        logger.info('Detect non-pre-trained model, pass without doing anything.')

    if config.MODEL.TYPE == 'swin':
        logger.info(f">>>>>>>>>> Remapping pre-trained keys for SWIN ..........")
        checkpoint_model = remap_pretrained_keys_swin(model, checkpoint_model, logger)
    elif config.MODEL.TYPE == 'vit':
        logger.info(f">>>>>>>>>> Remapping pre-trained keys for VIT ..........")
        checkpoint_model = remap_pretrained_keys_vit(model, checkpoint_model, logger)
    else:
        raise NotImplementedError

    msg = model.load_state_dict(checkpoint_model, strict=False)
    logger.info(msg)

    del checkpoint
    torch.cuda.empty_cache()
    logger.info(f">>>>>>>>>> loaded successfully '{config.PRETRAINED}'")


def _geometric_interpolate_rel_pos_bias(rel_pos_bias, src_size, dst_size, logger):
    """Resample a (src_size * src_size, heads) bias table to (dst_size * dst_size, heads).

    Source positions are placed on a geometric progression around 0 (ratio found by bisection
    in [1.01, 1.5]) and every head is resampled at integer target positions with a bicubic spline.
    """
    def geometric_progression(a, r, n):
        return a * (1.0 - r ** n) / (1.0 - r)

    # bisection for the ratio whose progression over src_size // 2 steps reaches dst_size // 2
    left, right = 1.01, 1.5
    while right - left > 1e-6:
        q = (left + right) / 2.0
        gp = geometric_progression(1, q, src_size // 2)
        if gp > dst_size // 2:
            right = q
        else:
            left = q

    dis = []
    cur = 1
    for i in range(src_size // 2):
        dis.append(cur)
        cur += q ** (i + 1)

    r_ids = [-_ for _ in reversed(dis)]

    x = r_ids + [0] + dis
    y = r_ids + [0] + dis

    t = dst_size // 2.0
    dx = np.arange(-t, t + 0.1, 1.0)
    dy = np.arange(-t, t + 0.1, 1.0)

    logger.info("Original positions = %s" % str(x))
    logger.info("Target positions = %s" % str(dx))

    all_rel_pos_bias = []
    for i in range(rel_pos_bias.size(1)):
        z = rel_pos_bias[:, i].view(src_size, src_size).float().numpy()
        # interp2d(x, y, z, kind='cubic')(dx, dy) is gone from current SciPy; this is the
        # replacement documented for regular grids (note the two transposes)
        spline = interpolate.RectBivariateSpline(x, y, z.T, kx=3, ky=3)
        resampled = np.ascontiguousarray(spline(dx, dy).T)
        all_rel_pos_bias.append(
            torch.Tensor(resampled).contiguous().view(-1, 1).to(rel_pos_bias.device))

    return torch.cat(all_rel_pos_bias, dim=-1)


def remap_pretrained_keys_swin(model, checkpoint_model, logger):
    """Adapt a Swin checkpoint dict in place to model and return it.

    Relative-position bias tables whose length differs from the model's are interpolated;
    relative_position_index, relative_coords_table and attn_mask entries are removed.
    """
    state_dict = model.state_dict()

    # Geometric interpolation when pre-trained patch size mismatch with fine-tuned patch size
    all_keys = list(checkpoint_model.keys())
    for key in all_keys:
        if "relative_position_bias_table" not in key:
            continue
        relative_position_bias_table_pretrained = checkpoint_model[key]
        relative_position_bias_table_current = state_dict[key]
        L1, nH1 = relative_position_bias_table_pretrained.size()
        L2, nH2 = relative_position_bias_table_current.size()
        if nH1 != nH2:
            logger.info(f"Error in loading {key}, passing......")
        elif L1 != L2:
            logger.info(f"{key}: Interpolate relative_position_bias_table using geo.")
            src_size = int(L1 ** 0.5)
            dst_size = int(L2 ** 0.5)
            checkpoint_model[key] = _geometric_interpolate_rel_pos_bias(
                relative_position_bias_table_pretrained, src_size, dst_size, logger)

    # delete buffers that are always re-initialised by the model
    for buffer_name in ("relative_position_index", "relative_coords_table", "attn_mask"):
        for k in [k for k in checkpoint_model.keys() if buffer_name in k]:
            del checkpoint_model[k]

    return checkpoint_model


def remap_pretrained_keys_vit(model, checkpoint_model, logger):
    """Adapt a ViT checkpoint dict in place to model and return it.

    A shared relative-position table is copied to every block when the model uses per-block
    tables, relative_position_index entries are removed, and bias tables are interpolated
    when the source and target grids differ (extra-token rows are carried over unchanged).

    :raises NotImplementedError: if the model's patch grid is not square
    """
    # Duplicate shared rel_pos_bias to each layer
    if getattr(model, 'use_rel_pos_bias', False) and "rel_pos_bias.relative_position_bias_table" in checkpoint_model:
        logger.info("Expand the shared relative position embedding to each transformer block.")
        num_layers = model.get_num_layers()
        rel_pos_bias = checkpoint_model["rel_pos_bias.relative_position_bias_table"]
        for i in range(num_layers):
            checkpoint_model["blocks.%d.attn.relative_position_bias_table" % i] = rel_pos_bias.clone()
        checkpoint_model.pop("rel_pos_bias.relative_position_bias_table")

    # Geometric interpolation when pre-trained patch size mismatch with fine-tuned patch size
    all_keys = list(checkpoint_model.keys())
    for key in all_keys:
        if "relative_position_index" in key:
            checkpoint_model.pop(key)

        if "relative_position_bias_table" in key:
            rel_pos_bias = checkpoint_model[key]
            src_num_pos, num_attn_heads = rel_pos_bias.size()
            dst_num_pos, _ = model.state_dict()[key].size()
            dst_patch_shape = model.patch_embed.patch_shape
            if dst_patch_shape[0] != dst_patch_shape[1]:
                raise NotImplementedError()
            num_extra_tokens = dst_num_pos - (dst_patch_shape[0] * 2 - 1) * (dst_patch_shape[1] * 2 - 1)
            src_size = int((src_num_pos - num_extra_tokens) ** 0.5)
            dst_size = int((dst_num_pos - num_extra_tokens) ** 0.5)
            if src_size != dst_size:
                logger.info("Position interpolate for %s from %dx%d to %dx%d" % (key, src_size, src_size, dst_size, dst_size))
                # split at an explicit index: [-n:] / [:-n] would be wrong for n == 0
                split = src_num_pos - num_extra_tokens
                extra_tokens = rel_pos_bias[split:, :]
                rel_pos_bias = rel_pos_bias[:split, :]

                rel_pos_bias = _geometric_interpolate_rel_pos_bias(rel_pos_bias, src_size, dst_size, logger)

                new_rel_pos_bias = torch.cat((rel_pos_bias, extra_tokens), dim=0)
                checkpoint_model[key] = new_rel_pos_bias

    return checkpoint_model
def get_PuzzleTuning_VPT_model(num_classes=0, edge_size=224, prompt_state_dict=None, base_state_dict='timm'):
    """
    Build the PuzzleTuning VPT model: a ViT with patch size 16 and 20 Deep prompt tokens.

    :param num_classes: classification required number of your dataset, 0 for taking the feature
    :param edge_size: the input edge size of the dataloder
    :param prompt_state_dict: passed through to build_promptmodel as the prompt weights (None for none)
    :param base_state_dict: passed through to build_promptmodel as the backbone weights, 'timm' by default

    :return: prepared model
    """

    model = build_promptmodel(
        # forwarded instead of hard-coded 0, so the documented argument takes effect;
        # the default 0 still gives the feature extractor model, output is CLS token
        num_classes=num_classes,
        edge_size=edge_size, model_idx='ViT', patch_size=16,
        Prompt_Token_num=20, VPT_type="Deep",
        prompt_state_dict=prompt_state_dict,
        base_state_dict=base_state_dict)

    return model
def Puzzle_test(model, data_loader_test, test_dataset_size, mask_ratio, fix_position_ratio, fix_patch_size,
                check_minibatch=100, enable_visualize_check=True, combined_pred_illustration=False, check_samples=1,
                device=None, output_dir=None, writer=None, args=None):
    """
    Run the reconstruction test of an SAE or MAE model over a dataloader.

    :param model: model called as model(inputs, ...) and returning (loss, pred, patches-or-mask)
    :param data_loader_test: iterable of (inputs, labels) minibatches; labels are not used
    :param test_dataset_size: size of the test dataset, only reported in the summary
    :param mask_ratio: mask ratio passed to an MAE model
    :param fix_position_ratio: fix_position_ratio passed to an SAE model
    :param fix_patch_size: passed to an SAE model as puzzle_patch_size
    :param check_minibatch: report (and optionally save pictures) every this many minibatches
    :param enable_visualize_check: save sample / puzzled-or-masked / reconstructed pictures at each check
    :param combined_pred_illustration: passed to SAE; for MAE, paste predictions onto the kept patches
    :param check_samples: how many images of a checking minibatch are saved
    :param device: device the inputs are moved to
    :param output_dir: directory for the saved pictures; nothing is saved when None
    :param writer: tensorboard-like writer with add_scalar, or None
    :param args: namespace providing model, model_idx and checkpoint_path

    :return: average loss per tested sample
    """
    # start testing
    print(f"Start testing for {args.model_idx} \n with checkpoint: {args.checkpoint_path}")
    start_time = time.time()
    index = 0
    model_time = time.time()
    # criterias, initially empty
    running_loss = 0.0
    log_running_loss = 0.0
    samples_seen = 0

    is_sae = args.model[0:3] == 'sae'
    # pictures can only be written when there is somewhere to put them
    visualize = enable_visualize_check and output_dir is not None

    model.eval()

    # no gradients are needed for testing; this avoids building autograd graphs
    with torch.no_grad():
        # Iterate over data.
        for inputs, _labels in data_loader_test:
            inputs = inputs.to(device, non_blocking=True)

            if is_sae:
                loss, pred, imgs_puzzled_patches = model(inputs, fix_position_ratio=fix_position_ratio,
                                                         puzzle_patch_size=fix_patch_size,
                                                         combined_pred_illustration=combined_pred_illustration)  # SAE
            else:  # args.model[0:3] == 'mae'
                loss, pred, mask_patch_indicators = model(inputs, mask_ratio=mask_ratio)  # MAE

            # a scalar loss stays as it is, a per-GPU loss vector is summed
            loss_value = float(loss.detach().sum().cpu())
            # log criterias: update
            log_running_loss += loss_value
            running_loss += loss_value * inputs.size(0)
            samples_seen += inputs.size(0)

            # attach the records to the tensorboard backend
            if writer is not None:
                # ...log the running loss
                writer.add_scalar('Test minibatch loss',
                                  float(loss_value),
                                  index)

            # at the checking time now
            if index % check_minibatch == check_minibatch - 1:
                model_time = time.time() - model_time

                check_index = index // check_minibatch + 1

                print('Test index ' + str(check_index) + ' of ' + str(check_minibatch) + ' minibatch with batch_size of '
                      + str(inputs.size(0)) + ' time used:', model_time)
                print('minibatch AVG loss:', float(log_running_loss) / check_minibatch)

                model_time = time.time()
                log_running_loss = 0.0

                # paint pic
                if visualize:
                    if is_sae:
                        corrupted_img_batch = unpatchify(imgs_puzzled_patches, patch_size=16)
                        corrupted_tag = 'Test_puzzled_idx_'
                        # Reconstructed img
                        recons_img_batch = unpatchify(pred, patch_size=16)

                    else:  # MAE
                        sample_img_patches = patchify(inputs, patch_size=16)  # on GPU
                        masked_img_patches = sample_img_patches * mask_patch_indicators.unsqueeze(-1).expand(
                            -1, -1, sample_img_patches.shape[-1])
                        corrupted_img_batch = unpatchify(masked_img_patches, patch_size=16)
                        corrupted_tag = 'Test_masked_idx_'

                        if combined_pred_illustration:
                            anti_mask_patch_indicators = 1 - mask_patch_indicators
                            pred_img_patches = pred * anti_mask_patch_indicators.unsqueeze(-1).expand(
                                -1, -1, sample_img_patches.shape[-1])

                            # Reconstructed img
                            recons_img_batch = unpatchify(masked_img_patches + pred_img_patches, patch_size=16)
                        else:
                            # Reconstructed img
                            recons_img_batch = unpatchify(pred, patch_size=16)

                    # never index past the end of the minibatch
                    for sampleIDX in range(min(check_samples, inputs.size(0))):
                        suffix = str(check_index) + '_sampleIDX_' + str(sampleIDX) + '.jpg'

                        # Ori img
                        sample_img = ToPILImage()(inputs.cpu()[sampleIDX])
                        sample_img.save(os.path.join(output_dir, 'Test_sample_idx_' + suffix))

                        recons_img = ToPILImage()(recons_img_batch.cpu()[sampleIDX])
                        recons_img.save(os.path.join(output_dir, 'Test_recons_idx_' + suffix))

                        # mask_img or puzzled_img; the sample index keeps several samples of
                        # one check from overwriting each other
                        corrupted_img = ToPILImage()(corrupted_img_batch.cpu()[sampleIDX])
                        corrupted_img.save(os.path.join(output_dir, corrupted_tag + suffix))

                        picpath = os.path.join(output_dir, 'Test_minibatchIDX_' + suffix)
                        Draw_tri_fig(sample_img, corrupted_img, recons_img, picpath)

            index += 1

    # time stamp
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))

    # log criterias: print
    # average over the samples actually tested (a dataloader may drop the last partial batch)
    epoch_loss = running_loss / samples_seen if samples_seen else 0.0
    print('\nTest_dataset_size: {} \nAvg Loss: {:.4f}'.format(test_dataset_size, epoch_loss))
    print('Testing time {}'.format(total_time_str))

    return epoch_loss
def main(args):
    """
    Prepare the run name, dataset, model and checkpoint from args, then run Puzzle_test.

    :param args: namespace produced by get_args_parser(); model, model_idx, gpu, log_dir and
                 output_dir are rewritten in place

    :return: -1 if args.model is neither an SAE nor an MAE model, otherwise None
    """
    # choose decoder version
    args.model = args.model + '_decoder' if args.seg_decoder is not None else args.model
    # note decoder
    args.model_idx = args.model_idx + args.model + '_' + args.seg_decoder if args.seg_decoder is not None \
        else args.model_idx + args.model
    # note PromptTuning
    args.model_idx = args.model_idx + '_Prompt_' + args.PromptTuning + '_tokennum_' + str(args.Prompt_Token_num) \
        if args.PromptTuning is not None else args.model_idx

    # Specify the Test settings
    if args.fix_position_ratio is not None and args.fix_patch_size is not None and args.mask_ratio is None:
        args.model_idx = 'Testing_' + args.model_idx + '_b_' + str(args.batch_size) \
                         + '_hint_ratio_' + str(args.fix_position_ratio) + '_patch_size_' + str(args.fix_patch_size)
    elif args.mask_ratio is not None and args.fix_position_ratio is None and args.fix_patch_size is None:
        args.model_idx = 'Testing_' + args.model_idx + '_b_' + str(args.batch_size) \
                         + '_mask_ratio_' + str(args.mask_ratio)
    else:
        print('not a correct test setting, should correctly specify fix_position_ratio/fix_patch_size/mask_ratio')

    print('\n\n' + args.model_idx + '\n\n')

    # setting k for: only card idx k is sighted for this code
    # the variable must be exported BEFORE the first CUDA query, otherwise it may be ignored
    if args.gpu_idx != -1:  # fixme: notice for test, we are going to use single gpu only
        os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu_idx)
        print("Use", torch.cuda.device_count(), "GPUs of idx:", args.gpu_idx)
    else:
        print("Use", torch.cuda.device_count(), "GPUs")
    args.gpu = torch.cuda.device_count()

    print('job AImageFolderDir: {}'.format(os.path.dirname(os.path.realpath(__file__))))
    print("{}".format(args).replace(', ', ',\n'))

    device = torch.device(args.device)  # cuda

    # fix the seed for reproducibility
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    cudnn.benchmark = True

    # simple augmentation
    # NOTE(review): Resize with a single int keeps the aspect ratio, so non-square images
    # stay non-square - confirm the test images are square
    transform_test = transforms.Compose([
        transforms.Resize(args.input_size),
        transforms.ToTensor(),
    ])

    test_dataroot = os.path.join(args.data_path)  # , 'test'
    dataset_test = datasets.ImageFolder(test_dataroot, transform=transform_test)
    test_dataset_size = len(dataset_test)

    print('dataset_test', dataset_test)  # Test data

    # skip minibatch, none to draw 80 figs
    check_minibatch = args.check_minibatch if args.check_minibatch is not None else \
        test_dataset_size // (80 * args.batch_size)
    check_minibatch = max(1, check_minibatch)

    # outputs
    if args.log_dir is not None:
        args.log_dir = os.path.join(args.log_dir, args.model_idx)
        os.makedirs(args.log_dir, exist_ok=True)
        log_writer = SummaryWriter(log_dir=args.log_dir)  # Tensorboard
    else:
        log_writer = None

    # output_dir
    if args.output_dir is not None:
        args.output_dir = os.path.join(args.output_dir, args.model_idx)
        os.makedirs(args.output_dir, exist_ok=True)
        print('Testing output files will be at', args.output_dir)

    data_loader_test = torch.utils.data.DataLoader(dataset_test,
                                                   shuffle=args.shuffle_dataloader,
                                                   batch_size=args.batch_size,
                                                   num_workers=args.num_workers,
                                                   pin_memory=args.pin_mem,  # False is suggested
                                                   drop_last=True)

    # define the model
    if args.model[0:3] == 'mae':
        model = models_mae.__dict__[args.model](img_size=args.input_size, norm_pix_loss=args.norm_pix_loss,
                                                prompt_mode=args.PromptTuning, Prompt_Token_num=args.Prompt_Token_num,
                                                dec_idx=args.seg_decoder)

    elif args.model[0:3] == 'sae':
        model = SAE.__dict__[args.model](img_size=args.input_size, group_shuffle_size=args.group_shuffle_size,
                                         norm_pix_loss=args.norm_pix_loss, prompt_mode=args.PromptTuning,
                                         Prompt_Token_num=args.Prompt_Token_num, dec_idx=args.seg_decoder)

    else:
        print('This MIM test script only support SAE or MAE')
        return -1

    # take model out of checkpoint and load_model
    # map to CPU first so the checkpoint loads whatever device it was saved from
    state_dict = torch.load(args.checkpoint_path, map_location='cpu')['model']
    # non-strict loading: report what did not match instead of hiding it
    load_msg = model.load_state_dict(state_dict, False)
    print('load_state_dict:', load_msg)
    model.to(device)

    # loss backward and optimizer operations and no longer needed in testing

    Puzzle_test(model, data_loader_test, test_dataset_size,
                args.mask_ratio, args.fix_position_ratio, args.fix_patch_size,
                check_minibatch, args.enable_visualize_check, args.combined_pred_illustration, args.check_samples,
                device=device, output_dir=args.output_dir, writer=log_writer, args=args)

    # os.system("shutdown")  # AUTO-DL server shutdown currently moved to .sh script for nohup task queue.
token patches).') + parser.add_argument('--fix_patch_size', default=None, type=int, # 原224 + help='images input size') + parser.add_argument('--group_shuffle_size', default=-1, type=int, help='group_shuffle_size of group shuffling,' + 'default -1 for the whole batch as a group') + # shuffle_dataloader + parser.add_argument('--shuffle_dataloader', action='store_true', help='shuffle Test dataset') + + # Tuning setting + # PromptTuning + parser.add_argument('--PromptTuning', default=None, type=str, + help='Deep/Shallow to use Prompt Tuning model instead of Finetuning model, by default None') + # Prompt_Token_num + parser.add_argument('--Prompt_Token_num', default=20, type=int, help='Prompt_Token_num') + # loss settings + parser.add_argument('--norm_pix_loss', action='store_true', + help='Use (per-patch) normalized pixels as targets for computing loss') + parser.set_defaults(norm_pix_loss=False) + + # PATH settings + # Dataset parameters /root/autodl-tmp/MARS_ALL /root/autodl-tmp/imagenet /root/autodl-tmp/datasets/All + parser.add_argument('--data_path', default='/root/autodl-tmp/datasets/PuzzleTuning_demoset', type=str, + help='dataset path') + parser.add_argument('--output_dir', default='/root/autodl-tmp/runs', + help='path where to save test log, empty for no saving') + parser.add_argument('--log_dir', default='/root/tf-logs', + help='path where to test tensorboard log') + + # Enviroment parameters + parser.add_argument('--gpu_idx', default=0, type=int, + help='use a single GPU with its index, -1 to use multiple GPU') + parser.add_argument('--device', default='cuda', + help='device to use for training / testing') + parser.add_argument('--seed', default=42, type=int) # ori 0 不过应该无所谓? 
+ + # checkpoint_state_dict_path + parser.add_argument('--checkpoint_path', + default='/root/autodl-tmp/runs/PuzzleTuning_SAE_vit_base_patch16_Prompt_Deep_tokennum_20_tr_timm_CPIAm/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20_checkpoint-199.pth', + type=str, help='load state_dict for testing') + + # check settings + parser.add_argument('--combined_pred_illustration', action='store_true', help='check combined_pred_illustration pics') + parser.add_argument('--enable_visualize_check', action='store_true', help='check and save pics') + parser.add_argument('--check_minibatch', default=None, type=int, help='check batch_size') + parser.add_argument('--check_samples', default=1, type=int, help='check how many images in a checking batch') + + # dataloader setting + parser.add_argument('--num_workers', default=10, type=int) + parser.add_argument('--pin_mem', action='store_true', + help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.') + parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem') + parser.set_defaults(pin_mem=True) + + return parser + + +if __name__ == '__main__': + args = get_args_parser() + args = args.parse_args() + + if args.output_dir: + Path(args.output_dir).mkdir(parents=True, exist_ok=True) + + main(args) \ No newline at end of file diff --git a/PuzzleTuning/PuzzleTuning Colab Demo.ipynb b/PuzzleTuning/PuzzleTuning Colab Demo.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..57dc6295af259577730844482765eef26cc98180 --- /dev/null +++ b/PuzzleTuning/PuzzleTuning Colab Demo.ipynb @@ -0,0 +1 @@ +{"cells":[{"cell_type":"markdown","metadata":{"id":"-1_HUut4YYm5"},"source":["# This is the official training Illustration of PuzzleTuning\n","* Use google colab pro+ (high RAM+GPU) to run 24 hours\n","* we use the Python3.7 Pytorch 1.9.0+cu111 torchvision 0.10.0+cu111\n","* we use the A100 GPU for the data-flow illustration with Colab\n","\n","The code and Training process along 
with all records are Open-Source:\n","* PuzzleTuning official github page: https://github.com/sagizty/PuzzleTuning\n","* The dataset CPIA is publicly available at: https://github.com/zhanglab2021/CPIA_Dataset\n"]},{"cell_type":"markdown","metadata":{"id":"dzCoT1IxZ-1B"},"source":["## Check Colab GPU"]},{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":751,"status":"ok","timestamp":1700807513684,"user":{"displayName":"Tianyi Zhang","userId":"06202607434029765461"},"user_tz":-480},"id":"ZnbrNSoSXFm5","outputId":"0aa1b390-69f7-4e80-c03e-e1a12a35d2f2"},"outputs":[{"name":"stdout","output_type":"stream","text":["Fri Nov 24 06:31:52 2023 \n","+-----------------------------------------------------------------------------+\n","| NVIDIA-SMI 525.105.17 Driver Version: 525.105.17 CUDA Version: 12.0 |\n","|-------------------------------+----------------------+----------------------+\n","| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n","| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n","| | | MIG M. |\n","|===============================+======================+======================|\n","| 0 NVIDIA A100-SXM... 
Off | 00000000:00:04.0 Off | 0 |\n","| N/A 34C P0 55W / 400W | 0MiB / 40960MiB | 0% Default |\n","| | | Disabled |\n","+-------------------------------+----------------------+----------------------+\n"," \n","+-----------------------------------------------------------------------------+\n","| Processes: |\n","| GPU GI CI PID Type Process name GPU Memory |\n","| ID ID Usage |\n","|=============================================================================|\n","| No running processes found |\n","+-----------------------------------------------------------------------------+\n"]}],"source":["# check GPU\n","!nvidia-smi"]},{"cell_type":"code","execution_count":2,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":10,"status":"ok","timestamp":1700807513684,"user":{"displayName":"Tianyi Zhang","userId":"06202607434029765461"},"user_tz":-480},"id":"n9GPOn5gcykA","outputId":"45658563-b504-449a-e204-5f2a1f14b959"},"outputs":[{"name":"stdout","output_type":"stream","text":["Fri Nov 24 02:31:52 PM UTC 2023\n"]}],"source":["!date --date='+8 hour' # CST time zone"]},{"cell_type":"markdown","metadata":{"id":"fbnpeHYUgsJz"},"source":["## Mount Google Drive"]},{"cell_type":"markdown","metadata":{"id":"ixynw_V1ZqqI"},"source":["This will save output images to your google drive, you can remove this line and the last part if you don't want the output"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"3obRNrIaffjK"},"outputs":[],"source":["from google.colab import drive\n","drive.mount('/content/drive')"]},{"cell_type":"markdown","metadata":{"id":"BYevYeMFYmlx"},"source":["## Build file-system enviroment"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"ePtQFcQCEPlu"},"outputs":[],"source":["# create file-system enviroment\n","# mount the google drive first\n","# https://drive.google.com/drive/u/1/my-drive\n","\n","# clear colab path\n","!rm -rf /data\n","!rm -rf /home/Pathology_Experiment\n","\n","# create 
path\n","!mkdir /home/Pathology_Experiment\n","!mkdir /home/Pathology_Experiment/runs\n","!mkdir /home/Pathology_Experiment/code\n","!mkdir /home/Pathology_Experiment/saved_models\n","!mkdir /home/Pathology_Experiment/imaging_results\n","\n","!mkdir /data\n","!mkdir /data/Pathology_Experiment\n","!mkdir /data/Pathology_Experiment/dataset\n","\n","print('Folder Tree Creation completed!')\n","\n","# get latest code from Github pancreatic-cancer-diagnosis-tansformer page\n","!git clone https://github.com/sagizty/PuzzleTuning.git /home/Pathology_Experiment/code\n","print('code transfer from github completed!')\n","\n","# get the CLS dataset by its zip\n","!mv /home/Pathology_Experiment/code/Archive/* /data/Pathology_Experiment/dataset/\n","# unzip\n","!unzip -q /data/Pathology_Experiment/dataset/PuzzleTuning_demoset.zip -d /data/Pathology_Experiment/dataset/\n","!unzip -q /data/Pathology_Experiment/dataset/warwick_CLS.zip -d /data/Pathology_Experiment/dataset/\n","# alter the path\n","!rm -f /data/Pathology_Experiment/dataset/PuzzleTuning_demoset.zip\n","!rm -f /data/Pathology_Experiment/dataset/warwick_CLS.zip\n","print('data transfer completed!')"]},{"cell_type":"markdown","metadata":{"id":"xLxxHGq_wwwL"},"source":["## Arrange the working enviorment"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"background_save":true},"id":"K1Yb2b6TGF4r"},"outputs":[{"name":"stdout","output_type":"stream","text":["Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (4.66.1)\n","Collecting timm==0.5.4\n"," Downloading timm-0.5.4-py3-none-any.whl (431 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m431.5/431.5 kB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: torch\u003e=1.4 in /usr/local/lib/python3.10/dist-packages (from timm==0.5.4) (2.1.0+cu118)\n","Requirement already satisfied: torchvision in 
/usr/local/lib/python3.10/dist-packages (from timm==0.5.4) (0.16.0+cu118)\n","Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch\u003e=1.4-\u003etimm==0.5.4) (3.13.1)\n","Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch\u003e=1.4-\u003etimm==0.5.4) (4.5.0)\n","Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch\u003e=1.4-\u003etimm==0.5.4) (1.12)\n","Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch\u003e=1.4-\u003etimm==0.5.4) (3.2.1)\n","Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch\u003e=1.4-\u003etimm==0.5.4) (3.1.2)\n","Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from torch\u003e=1.4-\u003etimm==0.5.4) (2023.6.0)\n","Requirement already satisfied: triton==2.1.0 in /usr/local/lib/python3.10/dist-packages (from torch\u003e=1.4-\u003etimm==0.5.4) (2.1.0)\n","Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from torchvision-\u003etimm==0.5.4) (1.23.5)\n","Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from torchvision-\u003etimm==0.5.4) (2.31.0)\n","Requirement already satisfied: pillow!=8.3.*,\u003e=5.3.0 in /usr/local/lib/python3.10/dist-packages (from torchvision-\u003etimm==0.5.4) (9.4.0)\n","Requirement already satisfied: MarkupSafe\u003e=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2-\u003etorch\u003e=1.4-\u003etimm==0.5.4) (2.1.3)\n","Requirement already satisfied: charset-normalizer\u003c4,\u003e=2 in /usr/local/lib/python3.10/dist-packages (from requests-\u003etorchvision-\u003etimm==0.5.4) (3.3.2)\n","Requirement already satisfied: idna\u003c4,\u003e=2.5 in /usr/local/lib/python3.10/dist-packages (from requests-\u003etorchvision-\u003etimm==0.5.4) (3.4)\n","Requirement already satisfied: 
urllib3\u003c3,\u003e=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests-\u003etorchvision-\u003etimm==0.5.4) (2.0.7)\n","Requirement already satisfied: certifi\u003e=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests-\u003etorchvision-\u003etimm==0.5.4) (2023.7.22)\n","Requirement already satisfied: mpmath\u003e=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy-\u003etorch\u003e=1.4-\u003etimm==0.5.4) (1.3.0)\n","Installing collected packages: timm\n","Successfully installed timm-0.5.4\n","Collecting einops\n"," Downloading einops-0.7.0-py3-none-any.whl (44 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m44.6/44.6 kB\u001b[0m \u001b[31m1.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hInstalling collected packages: einops\n","Successfully installed einops-0.7.0\n","Collecting ml_collections\n"," Downloading ml_collections-0.1.1.tar.gz (77 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.9/77.9 kB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n","Requirement already satisfied: absl-py in /usr/local/lib/python3.10/dist-packages (from ml_collections) (1.4.0)\n","Requirement already satisfied: PyYAML in /usr/local/lib/python3.10/dist-packages (from ml_collections) (6.0.1)\n","Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from ml_collections) (1.16.0)\n","Requirement already satisfied: contextlib2 in /usr/local/lib/python3.10/dist-packages (from ml_collections) (21.6.0)\n","Building wheels for collected packages: ml_collections\n"," Building wheel for ml_collections (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n"," Created wheel for ml_collections: filename=ml_collections-0.1.1-py3-none-any.whl size=94507 sha256=b01d6551e051eb94b398f73bab7144889d7ee2a44f392da831939d3029f7119a\n"," Stored in directory: /root/.cache/pip/wheels/7b/89/c9/a9b87790789e94aadcfc393c283e3ecd5ab916aed0a31be8fe\n","Successfully built ml_collections\n","Installing collected packages: ml_collections\n","Successfully installed ml_collections-0.1.1\n","Collecting ttach\n"," Downloading ttach-0.0.3-py3-none-any.whl (9.8 kB)\n","Installing collected packages: ttach\n","Successfully installed ttach-0.0.3\n","Collecting notifyemail\n"," Downloading notifyemail-1.1.1-py3-none-any.whl (19 kB)\n","Installing collected packages: notifyemail\n","Successfully installed notifyemail-1.1.1\n","Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (5.9.5)\n","Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (1.11.3)\n","Requirement already satisfied: numpy\u003c1.28.0,\u003e=1.21.6 in /usr/local/lib/python3.10/dist-packages (from scipy) (1.23.5)\n","Requirement already satisfied: torchsummary in /usr/local/lib/python3.10/dist-packages (1.5.1)\n","Collecting tensorboardX\n"," Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m101.7/101.7 kB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from tensorboardX) (1.23.5)\n","Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from tensorboardX) (23.2)\n","Requirement already satisfied: protobuf\u003e=3.20 in /usr/local/lib/python3.10/dist-packages (from tensorboardX) (3.20.3)\n","Installing collected packages: tensorboardX\n","Successfully installed tensorboardX-2.6.2.2\n","Requirement already satisfied: opencv_contrib_python in 
/usr/local/lib/python3.10/dist-packages (4.8.0.76)\n","Requirement already satisfied: numpy\u003e=1.21.2 in /usr/local/lib/python3.10/dist-packages (from opencv_contrib_python) (1.23.5)\n","Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (3.7.1)\n","Requirement already satisfied: contourpy\u003e=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (1.2.0)\n","Requirement already satisfied: cycler\u003e=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (0.12.1)\n","Requirement already satisfied: fonttools\u003e=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (4.44.3)\n","Requirement already satisfied: kiwisolver\u003e=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (1.4.5)\n","Requirement already satisfied: numpy\u003e=1.20 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (1.23.5)\n","Requirement already satisfied: packaging\u003e=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (23.2)\n","Requirement already satisfied: pillow\u003e=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (9.4.0)\n","Requirement already satisfied: pyparsing\u003e=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (3.1.1)\n","Requirement already satisfied: python-dateutil\u003e=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (2.8.2)\n","Requirement already satisfied: six\u003e=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil\u003e=2.7-\u003ematplotlib) (1.16.0)\n","Requirement already satisfied: ipykernel in /usr/local/lib/python3.10/dist-packages (5.5.6)\n","Requirement already satisfied: ipython-genutils in /usr/local/lib/python3.10/dist-packages (from ipykernel) (0.2.0)\n","Requirement already satisfied: ipython\u003e=5.0.0 in /usr/local/lib/python3.10/dist-packages (from ipykernel) (7.34.0)\n","Requirement already satisfied: traitlets\u003e=4.1.0 in 
/usr/local/lib/python3.10/dist-packages (from ipykernel) (5.7.1)\n","Requirement already satisfied: jupyter-client in /usr/local/lib/python3.10/dist-packages (from ipykernel) (6.1.12)\n","Requirement already satisfied: tornado\u003e=4.2 in /usr/local/lib/python3.10/dist-packages (from ipykernel) (6.3.2)\n","Requirement already satisfied: setuptools\u003e=18.5 in /usr/local/lib/python3.10/dist-packages (from ipython\u003e=5.0.0-\u003eipykernel) (67.7.2)\n","Collecting jedi\u003e=0.16 (from ipython\u003e=5.0.0-\u003eipykernel)\n"," Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m24.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: decorator in /usr/local/lib/python3.10/dist-packages (from ipython\u003e=5.0.0-\u003eipykernel) (4.4.2)\n","Requirement already satisfied: pickleshare in /usr/local/lib/python3.10/dist-packages (from ipython\u003e=5.0.0-\u003eipykernel) (0.7.5)\n","Requirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,\u003c3.1.0,\u003e=2.0.0 in /usr/local/lib/python3.10/dist-packages (from ipython\u003e=5.0.0-\u003eipykernel) (3.0.41)\n","Requirement already satisfied: pygments in /usr/local/lib/python3.10/dist-packages (from ipython\u003e=5.0.0-\u003eipykernel) (2.16.1)\n","Requirement already satisfied: backcall in /usr/local/lib/python3.10/dist-packages (from ipython\u003e=5.0.0-\u003eipykernel) (0.2.0)\n","Requirement already satisfied: matplotlib-inline in /usr/local/lib/python3.10/dist-packages (from ipython\u003e=5.0.0-\u003eipykernel) (0.1.6)\n","Requirement already satisfied: pexpect\u003e4.3 in /usr/local/lib/python3.10/dist-packages (from ipython\u003e=5.0.0-\u003eipykernel) (4.8.0)\n","Requirement already satisfied: jupyter-core\u003e=4.6.0 in /usr/local/lib/python3.10/dist-packages (from jupyter-client-\u003eipykernel) (5.5.0)\n","Requirement already satisfied: pyzmq\u003e=13 
in /usr/local/lib/python3.10/dist-packages (from jupyter-client-\u003eipykernel) (23.2.1)\n","Requirement already satisfied: python-dateutil\u003e=2.1 in /usr/local/lib/python3.10/dist-packages (from jupyter-client-\u003eipykernel) (2.8.2)\n","Requirement already satisfied: parso\u003c0.9.0,\u003e=0.8.3 in /usr/local/lib/python3.10/dist-packages (from jedi\u003e=0.16-\u003eipython\u003e=5.0.0-\u003eipykernel) (0.8.3)\n","Requirement already satisfied: platformdirs\u003e=2.5 in /usr/local/lib/python3.10/dist-packages (from jupyter-core\u003e=4.6.0-\u003ejupyter-client-\u003eipykernel) (4.0.0)\n","Requirement already satisfied: ptyprocess\u003e=0.5 in /usr/local/lib/python3.10/dist-packages (from pexpect\u003e4.3-\u003eipython\u003e=5.0.0-\u003eipykernel) (0.7.0)\n","Requirement already satisfied: wcwidth in /usr/local/lib/python3.10/dist-packages (from prompt-toolkit!=3.0.0,!=3.0.1,\u003c3.1.0,\u003e=2.0.0-\u003eipython\u003e=5.0.0-\u003eipykernel) (0.2.10)\n","Requirement already satisfied: six\u003e=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil\u003e=2.1-\u003ejupyter-client-\u003eipykernel) (1.16.0)\n","Installing collected packages: jedi\n","Successfully installed jedi-0.19.1\n"]}],"source":["# get packages\n","!pip install tqdm\n","!pip install timm==0.5.4\n","!pip install einops\n","!pip install ml_collections\n","!pip install ttach\n","!pip install notifyemail\n","!pip install psutil\n","!pip install scipy\n","!pip install torchsummary\n","!pip install tensorboardX\n","!pip install opencv_contrib_python\n","!pip install matplotlib\n","!pip install ipykernel"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"background_save":true},"id":"87Owjg_pN2yD"},"outputs":[{"name":"stdout","output_type":"stream","text":["Python 3.10.12\n"]}],"source":["!python 
--version"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"background_save":true},"id":"GpEVUWwqK79D"},"outputs":[{"name":"stdout","output_type":"stream","text":["Package Version\n","-------------------------------- ---------------------\n","absl-py 1.4.0\n","aiohttp 3.8.6\n","aiosignal 1.3.1\n","alabaster 0.7.13\n","albumentations 1.3.1\n","altair 4.2.2\n","anyio 3.7.1\n","appdirs 1.4.4\n","argon2-cffi 23.1.0\n","argon2-cffi-bindings 21.2.0\n","array-record 0.5.0\n","arviz 0.15.1\n","astropy 5.3.4\n","astunparse 1.6.3\n","async-timeout 4.0.3\n","atpublic 4.0\n","attrs 23.1.0\n","audioread 3.0.1\n","autograd 1.6.2\n","Babel 2.13.1\n","backcall 0.2.0\n","beautifulsoup4 4.11.2\n","bidict 0.22.1\n","bigframes 0.13.0\n","bleach 6.1.0\n","blinker 1.4\n","blis 0.7.11\n","blosc2 2.0.0\n","bokeh 3.3.1\n","bqplot 0.12.42\n","branca 0.7.0\n","build 1.0.3\n","CacheControl 0.13.1\n","cachetools 5.3.2\n","catalogue 2.0.10\n","certifi 2023.7.22\n","cffi 1.16.0\n","chardet 5.2.0\n","charset-normalizer 3.3.2\n","chex 0.1.7\n","click 8.1.7\n","click-plugins 1.1.1\n","cligj 0.7.2\n","cloudpickle 2.2.1\n","cmake 3.27.7\n","cmdstanpy 1.2.0\n","colorcet 3.0.1\n","colorlover 0.3.0\n","colour 0.1.5\n","community 1.0.0b1\n","confection 0.1.3\n","cons 0.4.6\n","contextlib2 21.6.0\n","contourpy 1.2.0\n","cryptography 41.0.5\n","cufflinks 0.17.3\n","cupy-cuda11x 11.0.0\n","cvxopt 1.3.2\n","cvxpy 1.3.2\n","cycler 0.12.1\n","cymem 2.0.8\n","Cython 3.0.5\n","dask 2023.8.1\n","datascience 0.17.6\n","db-dtypes 1.1.1\n","dbus-python 1.2.18\n","debugpy 1.6.6\n","decorator 4.4.2\n","defusedxml 0.7.1\n","diskcache 5.6.3\n","distributed 2023.8.1\n","distro 1.7.0\n","dlib 19.24.2\n","dm-tree 0.1.8\n","docutils 0.18.1\n","dopamine-rl 4.0.6\n","duckdb 0.9.2\n","earthengine-api 0.1.379\n","easydict 1.11\n","ecos 2.0.12\n","editdistance 0.6.2\n","eerepr 0.0.4\n","einops 0.7.0\n","en-core-web-sm 3.6.0\n","entrypoints 0.4\n","et-xmlfile 1.1.0\n","etils 1.5.2\n","etuples 
0.3.9\n","exceptiongroup 1.1.3\n","fastai 2.7.13\n","fastcore 1.5.29\n","fastdownload 0.0.7\n","fastjsonschema 2.19.0\n","fastprogress 1.0.3\n","fastrlock 0.8.2\n","filelock 3.13.1\n","fiona 1.9.5\n","firebase-admin 5.3.0\n","Flask 2.2.5\n","flatbuffers 23.5.26\n","flax 0.7.5\n","folium 0.14.0\n","fonttools 4.44.3\n","frozendict 2.3.8\n","frozenlist 1.4.0\n","fsspec 2023.6.0\n","future 0.18.3\n","gast 0.5.4\n","gcsfs 2023.6.0\n","GDAL 3.4.3\n","gdown 4.6.6\n","geemap 0.28.2\n","gensim 4.3.2\n","geocoder 1.38.1\n","geographiclib 2.0\n","geopandas 0.13.2\n","geopy 2.3.0\n","gin-config 0.5.0\n","glob2 0.7\n","google 2.0.3\n","google-ai-generativelanguage 0.3.3\n","google-api-core 2.11.1\n","google-api-python-client 2.84.0\n","google-auth 2.17.3\n","google-auth-httplib2 0.1.1\n","google-auth-oauthlib 1.0.0\n","google-cloud-bigquery 3.12.0\n","google-cloud-bigquery-connection 1.12.1\n","google-cloud-bigquery-storage 2.22.0\n","google-cloud-core 2.3.3\n","google-cloud-datastore 2.15.2\n","google-cloud-firestore 2.11.1\n","google-cloud-functions 1.13.3\n","google-cloud-iam 2.12.2\n","google-cloud-language 2.9.1\n","google-cloud-resource-manager 1.10.4\n","google-cloud-storage 2.8.0\n","google-cloud-translate 3.11.3\n","google-colab 1.0.0\n","google-crc32c 1.5.0\n","google-generativeai 0.2.2\n","google-pasta 0.2.0\n","google-resumable-media 2.6.0\n","googleapis-common-protos 1.61.0\n","googledrivedownloader 0.4\n","graphviz 0.20.1\n","greenlet 3.0.1\n","grpc-google-iam-v1 0.12.7\n","grpcio 1.59.2\n","grpcio-status 1.48.2\n","gspread 3.4.2\n","gspread-dataframe 3.3.1\n","gym 0.25.2\n","gym-notices 0.0.8\n","h5netcdf 1.3.0\n","h5py 3.9.0\n","holidays 0.36\n","holoviews 1.17.1\n","html5lib 1.1\n","httpimport 1.3.1\n","httplib2 0.22.0\n","huggingface-hub 0.19.4\n","humanize 4.7.0\n","hyperopt 0.2.7\n","ibis-framework 6.2.0\n","idna 3.4\n","imageio 2.31.6\n","imageio-ffmpeg 0.4.9\n","imagesize 1.4.1\n","imbalanced-learn 0.10.1\n","imgaug 0.4.0\n","importlib-metadata 
6.8.0\n","importlib-resources 6.1.1\n","imutils 0.5.4\n","inflect 7.0.0\n","iniconfig 2.0.0\n","install 1.3.5\n","intel-openmp 2023.2.0\n","ipyevents 2.0.2\n","ipyfilechooser 0.6.0\n","ipykernel 5.5.6\n","ipyleaflet 0.17.4\n","ipython 7.34.0\n","ipython-genutils 0.2.0\n","ipython-sql 0.5.0\n","ipytree 0.2.2\n","ipywidgets 7.7.1\n","itsdangerous 2.1.2\n","jax 0.4.20\n","jaxlib 0.4.20+cuda11.cudnn86\n","jedi 0.19.1\n","jeepney 0.7.1\n","jieba 0.42.1\n","Jinja2 3.1.2\n","joblib 1.3.2\n","jsonpickle 3.0.2\n","jsonschema 4.19.2\n","jsonschema-specifications 2023.11.1\n","jupyter-client 6.1.12\n","jupyter-console 6.1.0\n","jupyter_core 5.5.0\n","jupyter-server 1.24.0\n","jupyterlab-pygments 0.2.2\n","jupyterlab-widgets 3.0.9\n","kaggle 1.5.16\n","keras 2.14.0\n","keyring 23.5.0\n","kiwisolver 1.4.5\n","langcodes 3.3.0\n","launchpadlib 1.10.16\n","lazr.restfulclient 0.14.4\n","lazr.uri 1.0.6\n","lazy_loader 0.3\n","libclang 16.0.6\n","librosa 0.10.1\n","lida 0.0.10\n","lightgbm 4.1.0\n","linkify-it-py 2.0.2\n","llmx 0.0.15a0\n","llvmlite 0.41.1\n","locket 1.0.0\n","logical-unification 0.4.6\n","lxml 4.9.3\n","malloy 2023.1064\n","Markdown 3.5.1\n","markdown-it-py 3.0.0\n","MarkupSafe 2.1.3\n","matplotlib 3.7.1\n","matplotlib-inline 0.1.6\n","matplotlib-venn 0.11.9\n","mdit-py-plugins 0.4.0\n","mdurl 0.1.2\n","miniKanren 1.0.3\n","missingno 0.5.2\n","mistune 0.8.4\n","mizani 0.9.3\n","mkl 2023.2.0\n","ml-collections 0.1.1\n","ml-dtypes 0.2.0\n","mlxtend 0.22.0\n","more-itertools 10.1.0\n","moviepy 1.0.3\n","mpmath 1.3.0\n","msgpack 1.0.7\n","multidict 6.0.4\n","multipledispatch 1.0.0\n","multitasking 0.0.11\n","murmurhash 1.0.10\n","music21 9.1.0\n","natsort 8.4.0\n","nbclassic 1.0.0\n","nbclient 0.9.0\n","nbconvert 6.5.4\n","nbformat 5.9.2\n","nest-asyncio 1.5.8\n","networkx 3.2.1\n","nibabel 4.0.2\n","nltk 3.8.1\n","notebook 6.5.5\n","notebook_shim 0.2.3\n","notifyemail 1.1.1\n","numba 0.58.1\n","numexpr 2.8.7\n","numpy 1.23.5\n","oauth2client 4.1.3\n","oauthlib 
3.2.2\n","opencv-contrib-python 4.8.0.76\n","opencv-python 4.8.0.76\n","opencv-python-headless 4.8.1.78\n","openpyxl 3.1.2\n","opt-einsum 3.3.0\n","optax 0.1.7\n","orbax-checkpoint 0.4.2\n","osqp 0.6.2.post8\n","packaging 23.2\n","pandas 1.5.3\n","pandas-datareader 0.10.0\n","pandas-gbq 0.17.9\n","pandas-stubs 1.5.3.230304\n","pandocfilters 1.5.0\n","panel 1.3.1\n","param 2.0.1\n","parso 0.8.3\n","parsy 2.1\n","partd 1.4.1\n","pathlib 1.0.1\n","pathy 0.10.3\n","patsy 0.5.3\n","peewee 3.17.0\n","pexpect 4.8.0\n","pickleshare 0.7.5\n","Pillow 9.4.0\n","pip 23.1.2\n","pip-tools 6.13.0\n","platformdirs 4.0.0\n","plotly 5.15.0\n","plotnine 0.12.4\n","pluggy 1.3.0\n","polars 0.17.3\n","pooch 1.8.0\n","portpicker 1.5.2\n","prefetch-generator 1.0.3\n","preshed 3.0.9\n","prettytable 3.9.0\n","proglog 0.1.10\n","progressbar2 4.2.0\n","prometheus-client 0.18.0\n","promise 2.3\n","prompt-toolkit 3.0.41\n","prophet 1.1.5\n","proto-plus 1.22.3\n","protobuf 3.20.3\n","psutil 5.9.5\n","psycopg2 2.9.9\n","ptyprocess 0.7.0\n","py-cpuinfo 9.0.0\n","py4j 0.10.9.7\n","pyarrow 9.0.0\n","pyasn1 0.5.0\n","pyasn1-modules 0.3.0\n","pycocotools 2.0.7\n","pycparser 2.21\n","pyct 0.5.0\n","pydantic 1.10.13\n","pydata-google-auth 1.8.2\n","pydot 1.4.2\n","pydot-ng 2.0.0\n","pydotplus 2.0.2\n","PyDrive 1.3.1\n","PyDrive2 1.6.3\n","pyerfa 2.0.1.1\n","pygame 2.5.2\n","Pygments 2.16.1\n","PyGObject 3.42.1\n","PyJWT 2.3.0\n","pymc 5.7.2\n","pymystem3 0.2.0\n","PyOpenGL 3.1.7\n","pyOpenSSL 23.3.0\n","pyparsing 3.1.1\n","pyperclip 1.8.2\n","pyproj 3.6.1\n","pyproject_hooks 1.0.0\n","pyshp 2.3.1\n","PySocks 1.7.1\n","pytensor 2.14.2\n","pytest 7.4.3\n","python-apt 0.0.0\n","python-box 7.1.1\n","python-dateutil 2.8.2\n","python-louvain 0.16\n","python-slugify 8.0.1\n","python-utils 3.8.1\n","pytz 2023.3.post1\n","pyviz_comms 3.0.0\n","PyWavelets 1.4.1\n","PyYAML 6.0.1\n","pyzmq 23.2.1\n","qdldl 0.1.7.post0\n","qudida 0.0.4\n","ratelim 0.1.6\n","referencing 0.31.0\n","regex 2023.6.3\n","requests 
2.31.0\n","requests-oauthlib 1.3.1\n","requirements-parser 0.5.0\n","rich 13.7.0\n","rpds-py 0.13.0\n","rpy2 3.4.2\n","rsa 4.9\n","safetensors 0.4.0\n","scikit-image 0.19.3\n","scikit-learn 1.2.2\n","scipy 1.11.3\n","scooby 0.9.2\n","scs 3.2.4\n","seaborn 0.12.2\n","SecretStorage 3.3.1\n","Send2Trash 1.8.2\n","setuptools 67.7.2\n","shapely 2.0.2\n","six 1.16.0\n","sklearn-pandas 2.2.0\n","smart-open 6.4.0\n","sniffio 1.3.0\n","snowballstemmer 2.2.0\n","sortedcontainers 2.4.0\n","soundfile 0.12.1\n","soupsieve 2.5\n","soxr 0.3.7\n","spacy 3.6.1\n","spacy-legacy 3.0.12\n","spacy-loggers 1.0.5\n","Sphinx 5.0.2\n","sphinxcontrib-applehelp 1.0.7\n","sphinxcontrib-devhelp 1.0.5\n","sphinxcontrib-htmlhelp 2.0.4\n","sphinxcontrib-jsmath 1.0.1\n","sphinxcontrib-qthelp 1.0.6\n","sphinxcontrib-serializinghtml 1.1.9\n","SQLAlchemy 2.0.23\n","sqlglot 17.16.2\n","sqlparse 0.4.4\n","srsly 2.4.8\n","stanio 0.3.0\n","statsmodels 0.14.0\n","sympy 1.12\n","tables 3.8.0\n","tabulate 0.9.0\n","tbb 2021.11.0\n","tblib 3.0.0\n","tenacity 8.2.3\n","tensorboard 2.14.1\n","tensorboard-data-server 0.7.2\n","tensorboardX 2.6.2.2\n","tensorflow 2.14.0\n","tensorflow-datasets 4.9.3\n","tensorflow-estimator 2.14.0\n","tensorflow-gcs-config 2.14.0\n","tensorflow-hub 0.15.0\n","tensorflow-io-gcs-filesystem 0.34.0\n","tensorflow-metadata 1.14.0\n","tensorflow-probability 0.22.0\n","tensorstore 0.1.45\n","termcolor 2.3.0\n","terminado 0.18.0\n","text-unidecode 1.3\n","textblob 0.17.1\n","tf-slim 1.1.0\n","thinc 8.1.12\n","threadpoolctl 3.2.0\n","tifffile 2023.9.26\n","timm 0.5.4\n","tinycss2 1.2.1\n","tokenizers 0.15.0\n","toml 0.10.2\n","tomli 2.0.1\n","toolz 0.12.0\n","torch 2.1.0+cu118\n","torchaudio 2.1.0+cu118\n","torchdata 0.7.0\n","torchsummary 1.5.1\n","torchtext 0.16.0\n","torchvision 0.16.0+cu118\n","tornado 6.3.2\n","tqdm 4.66.1\n","traitlets 5.7.1\n","traittypes 0.2.1\n","transformers 4.35.2\n","triton 2.1.0\n","ttach 0.0.3\n","tweepy 4.14.0\n","typer 0.9.0\n","types-pytz 
2023.3.1.1\n","types-setuptools 68.2.0.1\n","typing_extensions 4.5.0\n","tzlocal 5.2\n","uc-micro-py 1.0.2\n","uritemplate 4.1.1\n","urllib3 2.0.7\n","vega-datasets 0.9.0\n","wadllib 1.3.6\n","wasabi 1.1.2\n","wcwidth 0.2.10\n","webcolors 1.13\n","webencodings 0.5.1\n","websocket-client 1.6.4\n","Werkzeug 3.0.1\n","wheel 0.41.3\n","widgetsnbextension 3.6.6\n","wordcloud 1.9.2\n","wrapt 1.14.1\n","xarray 2023.7.0\n","xarray-einstats 0.6.0\n","xgboost 2.0.2\n","xlrd 2.0.1\n","xxhash 3.4.1\n","xyzservices 2023.10.1\n","yarl 1.9.2\n","yellowbrick 1.5\n","yfinance 0.2.31\n","zict 3.0.0\n","zipp 3.17.0\n"]}],"source":["!pip list\n","!pip freeze\u003erequirements.txt\n","!cp requirements.txt ../runs"]},{"cell_type":"markdown","metadata":{"id":"h31KAx1ZZEl9"},"source":["# Pre-Training\n","* set up path by command line\n","* use argparse to set down hyper-parameter\n","\n","10000epochs will be trined with 400 images, for data-flow illustration.\n","We suggest you to use 4 * A100 SMX4 GPUs to train PuzzleTuning with CPIA dataset.\n","\n"]},{"cell_type":"markdown","metadata":{"id":"hveEqtxuZePT"},"source":["Our official training script is given here:"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"background_save":true},"id":"F-wStD9SYeXA"},"outputs":[],"source":["# nohup python PuzzleTuning.py --batch_size 64 --group_shuffle_size 16 --blr 1.5e-4 --epochs 200 --accum_iter 2 --print_freq 2000 --check_point_gap 50 --input_size 224 --warmup_epochs 20 --pin_mem --num_workers 32 --strategy loop --PromptTuning Deep --basic_state_dict ../saved_models/ViT_b16_224_Imagenet.pth --data_path ../datasets/All \u0026"]},{"cell_type":"markdown","metadata":{"id":"Y3pitng_YhqM"},"source":["All following lines are for data-flow illustation with colab"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"background_save":true},"id":"mjq7h2j1Wzdh"},"outputs":[{"name":"stdout","output_type":"stream","text":["/home/Pathology_Experiment/code\n"]}],"source":["# 
change working dir\n","import os\n","os.chdir(\"/home/Pathology_Experiment/code\")\n","!pwd"]},{"cell_type":"markdown","metadata":{"id":"7xGPpwiXa5vC"},"source":["Training"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"background_save":true},"id":"T6jwGi8oa69a"},"outputs":[{"name":"stdout","output_type":"stream","text":["\u001b[1;30;43mStreaming output truncated to the last 5000 lines.\u001b[0m\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39167] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0216 (0.0216) time: 0.2915 data: 0.2048 max mem: 5716\n","Epoch: [39167] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0247 (0.0241) time: 0.0953 data: 0.0172 max mem: 5716\n","Epoch: [39167] Total time: 0:00:01 (0.0994 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0247 (0.0241) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39168] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0211 (0.0211) time: 0.2886 data: 0.1982 max mem: 5716\n","Epoch: [39168] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0240 (0.0244) time: 0.0966 data: 0.0167 max mem: 5716\n","Epoch: [39168] Total time: 0:00:01 (0.1007 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0240 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39169] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0210 (0.0210) time: 0.2844 data: 0.1939 max mem: 5716\n","Epoch: [39169] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0244 (0.0242) time: 0.0944 data: 0.0163 max mem: 5716\n","Epoch: [39169] Total time: 0:00:01 (0.0985 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0244 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 
112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39170] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0197 (0.0197) time: 0.2880 data: 0.2013 max mem: 5716\n","Epoch: [39170] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0199 (0.0208) time: 0.0956 data: 0.0169 max mem: 5716\n","Epoch: [39170] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0199 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39171] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0222 (0.0222) time: 0.2832 data: 0.1917 max mem: 5716\n","Epoch: [39171] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0216 (0.0208) time: 0.0948 data: 0.0161 max mem: 5716\n","Epoch: [39171] Total time: 0:00:01 (0.0988 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0216 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39172] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0209 (0.0209) time: 0.2895 data: 0.1995 max mem: 5716\n","Epoch: [39172] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0206 (0.0209) time: 0.0959 data: 0.0168 max mem: 5716\n","Epoch: [39172] Total time: 0:00:01 (0.1000 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0206 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39173] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0233 (0.0233) time: 0.2843 data: 0.1959 max mem: 5716\n","Epoch: [39173] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0232 (0.0229) time: 0.0943 data: 0.0165 max mem: 5716\n","Epoch: [39173] Total time: 0:00:01 (0.0984 s / it)\n","Averaged 
stats: lr: 0.000000 loss: 0.0232 (0.0229) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39174] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0241 (0.0241) time: 0.2906 data: 0.1934 max mem: 5716\n","Epoch: [39174] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0241 (0.0233) time: 0.0948 data: 0.0163 max mem: 5716\n","Epoch: [39174] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0241 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39175] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0242 (0.0242) time: 0.2966 data: 0.2071 max mem: 5716\n","Epoch: [39175] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0226 (0.0232) time: 0.0965 data: 0.0174 max mem: 5716\n","Epoch: [39175] Total time: 0:00:01 (0.1006 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0226 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39176] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0258 (0.0258) time: 0.2936 data: 0.1950 max mem: 5716\n","Epoch: [39176] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0236 (0.0239) time: 0.0955 data: 0.0164 max mem: 5716\n","Epoch: [39176] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0236 (0.0239) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39177] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0225 (0.0225) time: 0.2921 data: 0.2054 max mem: 5716\n","Epoch: [39177] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0245 (0.0242) time: 0.0989 data: 
0.0173 max mem: 5716\n","Epoch: [39177] Total time: 0:00:01 (0.1031 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0245 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39178] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0232 (0.0232) time: 0.2914 data: 0.2039 max mem: 5716\n","Epoch: [39178] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0247 (0.0242) time: 0.0949 data: 0.0171 max mem: 5716\n","Epoch: [39178] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0247 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39179] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0196 (0.0196) time: 0.2880 data: 0.1962 max mem: 5716\n","Epoch: [39179] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0196 (0.0206) time: 0.0949 data: 0.0165 max mem: 5716\n","Epoch: [39179] Total time: 0:00:01 (0.0991 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0196 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39180] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0212 (0.0212) time: 0.2890 data: 0.2023 max mem: 5716\n","Epoch: [39180] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0202 (0.0208) time: 0.0950 data: 0.0170 max mem: 5716\n","Epoch: [39180] Total time: 0:00:01 (0.0991 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0202 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39181] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0242 (0.0242) time: 0.2971 data: 0.2001 max mem: 
5716\n","Epoch: [39181] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0205 (0.0209) time: 0.0951 data: 0.0168 max mem: 5716\n","Epoch: [39181] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0205 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39182] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0185 (0.0185) time: 0.2895 data: 0.2023 max mem: 5716\n","Epoch: [39182] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0227 (0.0232) time: 0.0953 data: 0.0170 max mem: 5716\n","Epoch: [39182] Total time: 0:00:01 (0.0994 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0227 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39183] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0217 (0.0217) time: 0.2912 data: 0.2033 max mem: 5716\n","Epoch: [39183] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0224 (0.0229) time: 0.0949 data: 0.0171 max mem: 5716\n","Epoch: [39183] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0224 (0.0229) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39184] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0240 (0.0240) time: 0.2834 data: 0.1942 max mem: 5716\n","Epoch: [39184] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0235 (0.0235) time: 0.0981 data: 0.0163 max mem: 5716\n","Epoch: [39184] Total time: 0:00:01 (0.1023 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0235 (0.0235) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39185] [ 
0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0242 (0.0242) time: 0.2991 data: 0.2109 max mem: 5716\n","Epoch: [39185] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0226 (0.0242) time: 0.0958 data: 0.0177 max mem: 5716\n","Epoch: [39185] Total time: 0:00:01 (0.0999 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0226 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39186] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0225 (0.0225) time: 0.2837 data: 0.1913 max mem: 5716\n","Epoch: [39186] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0236 (0.0247) time: 0.0940 data: 0.0161 max mem: 5716\n","Epoch: [39186] Total time: 0:00:01 (0.0981 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0236 (0.0247) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39187] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0256 (0.0256) time: 0.2908 data: 0.2052 max mem: 5716\n","Epoch: [39187] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0234 (0.0243) time: 0.0953 data: 0.0172 max mem: 5716\n","Epoch: [39187] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0234 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39188] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0199 (0.0199) time: 0.2964 data: 0.2084 max mem: 5716\n","Epoch: [39188] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0207 (0.0210) time: 0.0958 data: 0.0175 max mem: 5716\n","Epoch: [39188] Total time: 0:00:01 (0.0999 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0207 (0.0210) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39189] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0174 (0.0174) time: 0.2920 data: 0.2044 max mem: 5716\n","Epoch: [39189] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0203 (0.0209) time: 0.0950 data: 0.0172 max mem: 5716\n","Epoch: [39189] Total time: 0:00:01 (0.0991 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0203 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39190] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0200 (0.0200) time: 0.2902 data: 0.2035 max mem: 5716\n","Epoch: [39190] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0200 (0.0206) time: 0.0953 data: 0.0171 max mem: 5716\n","Epoch: [39190] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0200 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39191] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0179 (0.0179) time: 0.3042 data: 0.2043 max mem: 5716\n","Epoch: [39191] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0229 (0.0228) time: 0.0957 data: 0.0172 max mem: 5716\n","Epoch: [39191] Total time: 0:00:01 (0.0998 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0229 (0.0228) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39192] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0204 (0.0204) time: 0.3022 data: 0.2075 max mem: 5716\n","Epoch: [39192] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0221 (0.0234) time: 0.0965 data: 0.0174 max mem: 5716\n","Epoch: [39192] Total time: 0:00:01 (0.1007 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0221 
(0.0234) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39193] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0246 (0.0246) time: 0.2921 data: 0.2048 max mem: 5716\n","Epoch: [39193] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0221 (0.0234) time: 0.0953 data: 0.0172 max mem: 5716\n","Epoch: [39193] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0221 (0.0234) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39194] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0248 (0.0248) time: 0.2807 data: 0.1907 max mem: 5716\n","Epoch: [39194] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0241 (0.0244) time: 0.0950 data: 0.0160 max mem: 5716\n","Epoch: [39194] Total time: 0:00:01 (0.0991 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0241 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39195] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0293 (0.0293) time: 0.2886 data: 0.2005 max mem: 5716\n","Epoch: [39195] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0243 (0.0244) time: 0.0949 data: 0.0169 max mem: 5716\n","Epoch: [39195] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0243 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39196] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0213 (0.0213) time: 0.2926 data: 0.1923 max mem: 5716\n","Epoch: [39196] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0240 (0.0241) time: 0.0949 data: 0.0162 max mem: 5716\n","Epoch: 
[39196] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0240 (0.0241) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39197] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0213 (0.0213) time: 0.2885 data: 0.1916 max mem: 5716\n","Epoch: [39197] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0202 (0.0209) time: 0.0949 data: 0.0161 max mem: 5716\n","Epoch: [39197] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0202 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39198] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0235 (0.0235) time: 0.2906 data: 0.2040 max mem: 5716\n","Epoch: [39198] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0212 (0.0207) time: 0.0953 data: 0.0171 max mem: 5716\n","Epoch: [39198] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0212 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39199] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0241 (0.0241) time: 0.2995 data: 0.1958 max mem: 5716\n","Epoch: [39199] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0210 (0.0207) time: 0.0957 data: 0.0165 max mem: 5716\n","Epoch: [39199] Total time: 0:00:01 (0.0998 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0210 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39200] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0197 (0.0197) time: 0.2956 data: 0.2058 max mem: 5716\n","Epoch: [39200] [11/12] eta: 0:00:00 
lr: 0.000000 loss: 0.0230 (0.0233) time: 0.0962 data: 0.0173 max mem: 5716\n","Epoch: [39200] Total time: 0:00:01 (0.1004 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0230 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39201] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0205 (0.0205) time: 0.2909 data: 0.2029 max mem: 5716\n","Epoch: [39201] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0231 (0.0232) time: 0.0997 data: 0.0171 max mem: 5716\n","Epoch: [39201] Total time: 0:00:01 (0.1040 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0231 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39202] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0227 (0.0227) time: 0.2903 data: 0.2003 max mem: 5716\n","Epoch: [39202] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0233 (0.0236) time: 0.0976 data: 0.0168 max mem: 5716\n","Epoch: [39202] Total time: 0:00:01 (0.1018 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0233 (0.0236) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39203] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0226 (0.0226) time: 0.2833 data: 0.1940 max mem: 5716\n","Epoch: [39203] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0228 (0.0246) time: 0.0949 data: 0.0164 max mem: 5716\n","Epoch: [39203] Total time: 0:00:01 (0.0991 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0228 (0.0246) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39204] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0214 
(0.0214) time: 0.2899 data: 0.1996 max mem: 5716\n","Epoch: [39204] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0233 (0.0240) time: 0.0957 data: 0.0168 max mem: 5716\n","Epoch: [39204] Total time: 0:00:01 (0.0999 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0233 (0.0240) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39205] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0237 (0.0237) time: 0.2823 data: 0.1906 max mem: 5716\n","Epoch: [39205] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0237 (0.0246) time: 0.0951 data: 0.0160 max mem: 5716\n","Epoch: [39205] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0237 (0.0246) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39206] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0203 (0.0203) time: 0.2985 data: 0.2102 max mem: 5716\n","Epoch: [39206] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0197 (0.0207) time: 0.0963 data: 0.0177 max mem: 5716\n","Epoch: [39206] Total time: 0:00:01 (0.1005 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0197 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39207] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0236 (0.0236) time: 0.2897 data: 0.1970 max mem: 5716\n","Epoch: [39207] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0210 (0.0210) time: 0.0959 data: 0.0166 max mem: 5716\n","Epoch: [39207] Total time: 0:00:01 (0.1000 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0210 (0.0210) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39208] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0245 (0.0245) time: 0.2897 data: 0.1998 max mem: 5716\n","Epoch: [39208] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0201 (0.0207) time: 0.0950 data: 0.0168 max mem: 5716\n","Epoch: [39208] Total time: 0:00:01 (0.1002 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0201 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39209] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0242 (0.0242) time: 0.3004 data: 0.2061 max mem: 5716\n","Epoch: [39209] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0234 (0.0233) time: 0.0971 data: 0.0173 max mem: 5716\n","Epoch: [39209] Total time: 0:00:01 (0.1027 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0234 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39210] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0226 (0.0226) time: 0.2937 data: 0.2054 max mem: 5716\n","Epoch: [39210] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0221 (0.0231) time: 0.0966 data: 0.0173 max mem: 5716\n","Epoch: [39210] Total time: 0:00:01 (0.1008 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0221 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39211] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0198 (0.0198) time: 0.2867 data: 0.1955 max mem: 5716\n","Epoch: [39211] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0236 (0.0232) time: 0.0956 data: 0.0165 max mem: 5716\n","Epoch: [39211] Total time: 0:00:01 (0.0998 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0236 
(0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39212] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0241 (0.0241) time: 0.3049 data: 0.2065 max mem: 5716\n","Epoch: [39212] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0247 (0.0245) time: 0.0965 data: 0.0173 max mem: 5716\n","Epoch: [39212] Total time: 0:00:01 (0.1006 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0247 (0.0245) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39213] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0253 (0.0253) time: 0.2945 data: 0.2034 max mem: 5716\n","Epoch: [39213] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0253 (0.0246) time: 0.0963 data: 0.0171 max mem: 5716\n","Epoch: [39213] Total time: 0:00:01 (0.1003 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0253 (0.0246) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39214] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0289 (0.0289) time: 0.2878 data: 0.1981 max mem: 5716\n","Epoch: [39214] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0234 (0.0243) time: 0.0951 data: 0.0167 max mem: 5716\n","Epoch: [39214] Total time: 0:00:01 (0.0991 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0234 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39215] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0234 (0.0234) time: 0.2811 data: 0.1922 max mem: 5716\n","Epoch: [39215] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0207 (0.0212) time: 0.0943 data: 0.0162 max mem: 5716\n","Epoch: 
[39215] Total time: 0:00:01 (0.0984 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0207 (0.0212) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39216] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0228 (0.0228) time: 0.2962 data: 0.2016 max mem: 5716\n","Epoch: [39216] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0201 (0.0209) time: 0.0953 data: 0.0169 max mem: 5716\n","Epoch: [39216] Total time: 0:00:01 (0.0993 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0201 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39217] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0168 (0.0168) time: 0.2777 data: 0.1891 max mem: 5716\n","Epoch: [39217] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0207 (0.0207) time: 0.0941 data: 0.0159 max mem: 5716\n","Epoch: [39217] Total time: 0:00:01 (0.0982 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0207 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39218] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0208 (0.0208) time: 0.2812 data: 0.1868 max mem: 5716\n","Epoch: [39218] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0231 (0.0231) time: 0.0970 data: 0.0157 max mem: 5716\n","Epoch: [39218] Total time: 0:00:01 (0.1011 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0231 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39219] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0223 (0.0223) time: 0.2862 data: 0.1974 max mem: 5716\n","Epoch: [39219] [11/12] eta: 0:00:00 
lr: 0.000000 loss: 0.0232 (0.0235) time: 0.0979 data: 0.0166 max mem: 5716\n","Epoch: [39219] Total time: 0:00:01 (0.1023 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0232 (0.0235) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39220] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0253 (0.0253) time: 0.2909 data: 0.2040 max mem: 5716\n","Epoch: [39220] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0230 (0.0233) time: 0.0955 data: 0.0172 max mem: 5716\n","Epoch: [39220] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0230 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39221] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0226 (0.0226) time: 0.2874 data: 0.1964 max mem: 5716\n","Epoch: [39221] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0235 (0.0243) time: 0.0957 data: 0.0165 max mem: 5716\n","Epoch: [39221] Total time: 0:00:01 (0.0998 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0235 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39222] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0258 (0.0258) time: 0.3034 data: 0.2141 max mem: 5716\n","Epoch: [39222] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0241 (0.0244) time: 0.0962 data: 0.0180 max mem: 5716\n","Epoch: [39222] Total time: 0:00:01 (0.1003 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0241 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39223] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0304 
(0.0304) time: 0.2919 data: 0.2009 max mem: 5716\n","Epoch: [39223] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0240 (0.0245) time: 0.0958 data: 0.0169 max mem: 5716\n","Epoch: [39223] Total time: 0:00:01 (0.1000 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0240 (0.0245) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39224] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0244 (0.0244) time: 0.2855 data: 0.1900 max mem: 5716\n","Epoch: [39224] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0201 (0.0208) time: 0.0943 data: 0.0160 max mem: 5716\n","Epoch: [39224] Total time: 0:00:01 (0.0983 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0201 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39225] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0215 (0.0215) time: 0.2878 data: 0.1984 max mem: 5716\n","Epoch: [39225] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0202 (0.0209) time: 0.0958 data: 0.0167 max mem: 5716\n","Epoch: [39225] Total time: 0:00:01 (0.1012 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0202 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39226] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0218 (0.0218) time: 0.2885 data: 0.1955 max mem: 5716\n","Epoch: [39226] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0214 (0.0207) time: 0.0952 data: 0.0164 max mem: 5716\n","Epoch: [39226] Total time: 0:00:01 (0.0994 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0214 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39227] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0191 (0.0191) time: 0.2841 data: 0.1937 max mem: 5716\n","Epoch: [39227] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0225 (0.0233) time: 0.0953 data: 0.0163 max mem: 5716\n","Epoch: [39227] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0225 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39228] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0243 (0.0243) time: 0.2876 data: 0.1998 max mem: 5716\n","Epoch: [39228] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0238 (0.0233) time: 0.0955 data: 0.0168 max mem: 5716\n","Epoch: [39228] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0238 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39229] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0185 (0.0185) time: 0.2777 data: 0.1882 max mem: 5716\n","Epoch: [39229] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0227 (0.0231) time: 0.0936 data: 0.0158 max mem: 5716\n","Epoch: [39229] Total time: 0:00:01 (0.0978 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0227 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39230] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0309 (0.0309) time: 0.2854 data: 0.1951 max mem: 5716\n","Epoch: [39230] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0241 (0.0245) time: 0.0952 data: 0.0164 max mem: 5716\n","Epoch: [39230] Total time: 0:00:01 (0.0993 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0241 
(0.0245) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39231] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0236 (0.0236) time: 0.3027 data: 0.2006 max mem: 5716\n","Epoch: [39231] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0245 (0.0246) time: 0.0963 data: 0.0169 max mem: 5716\n","Epoch: [39231] Total time: 0:00:01 (0.1005 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0245 (0.0246) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39232] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0215 (0.0215) time: 0.3095 data: 0.2085 max mem: 5716\n","Epoch: [39232] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0248 (0.0245) time: 0.1007 data: 0.0175 max mem: 5716\n","Epoch: [39232] Total time: 0:00:01 (0.1049 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0248 (0.0245) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39233] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0242 (0.0242) time: 0.2904 data: 0.2033 max mem: 5716\n","Epoch: [39233] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0205 (0.0208) time: 0.0979 data: 0.0171 max mem: 5716\n","Epoch: [39233] Total time: 0:00:01 (0.1020 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0205 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39234] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0232 (0.0232) time: 0.2928 data: 0.2039 max mem: 5716\n","Epoch: [39234] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0205 (0.0208) time: 0.0950 data: 0.0171 max mem: 5716\n","Epoch: 
[39234] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0205 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39235] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0211 (0.0211) time: 0.2861 data: 0.1957 max mem: 5716\n","Epoch: [39235] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0208 (0.0206) time: 0.0943 data: 0.0165 max mem: 5716\n","Epoch: [39235] Total time: 0:00:01 (0.0998 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0208 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39236] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0195 (0.0195) time: 0.2903 data: 0.2042 max mem: 5716\n","Epoch: [39236] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0227 (0.0234) time: 0.0954 data: 0.0172 max mem: 5716\n","Epoch: [39236] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0227 (0.0234) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39237] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0221 (0.0221) time: 0.2978 data: 0.1980 max mem: 5716\n","Epoch: [39237] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0224 (0.0230) time: 0.0956 data: 0.0167 max mem: 5716\n","Epoch: [39237] Total time: 0:00:01 (0.0999 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0224 (0.0230) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39238] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0212 (0.0212) time: 0.2901 data: 0.2025 max mem: 5716\n","Epoch: [39238] [11/12] eta: 0:00:00 
lr: 0.000000 loss: 0.0233 (0.0234) time: 0.0947 data: 0.0170 max mem: 5716\n","Epoch: [39238] Total time: 0:00:01 (0.0988 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0233 (0.0234) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39239] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0236 (0.0236) time: 0.2912 data: 0.2024 max mem: 5716\n","Epoch: [39239] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0234 (0.0245) time: 0.0954 data: 0.0170 max mem: 5716\n","Epoch: [39239] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0234 (0.0245) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39240] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0229 (0.0229) time: 0.2897 data: 0.2009 max mem: 5716\n","Epoch: [39240] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0229 (0.0241) time: 0.0949 data: 0.0169 max mem: 5716\n","Epoch: [39240] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0229 (0.0241) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39241] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0210 (0.0210) time: 0.2953 data: 0.2065 max mem: 5716\n","Epoch: [39241] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0233 (0.0242) time: 0.0956 data: 0.0174 max mem: 5716\n","Epoch: [39241] Total time: 0:00:01 (0.1007 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0233 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39242] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0205 
(0.0205) time: 0.2895 data: 0.2020 max mem: 5716\n","Epoch: [39242] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0209 (0.0208) time: 0.0950 data: 0.0170 max mem: 5716\n","Epoch: [39242] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0209 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39243] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0227 (0.0227) time: 0.2968 data: 0.2077 max mem: 5716\n","Epoch: [39243] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0204 (0.0207) time: 0.0974 data: 0.0175 max mem: 5716\n","Epoch: [39243] Total time: 0:00:01 (0.1027 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0204 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39244] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0200 (0.0200) time: 0.3002 data: 0.2107 max mem: 5716\n","Epoch: [39244] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0210 (0.0210) time: 0.0974 data: 0.0177 max mem: 5716\n","Epoch: [39244] Total time: 0:00:01 (0.1017 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0210 (0.0210) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39245] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0180 (0.0180) time: 0.3023 data: 0.2036 max mem: 5716\n","Epoch: [39245] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0236 (0.0232) time: 0.0966 data: 0.0171 max mem: 5716\n","Epoch: [39245] Total time: 0:00:01 (0.1008 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0236 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39246] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0231 (0.0231) time: 0.2818 data: 0.1918 max mem: 5716\n","Epoch: [39246] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0225 (0.0236) time: 0.0964 data: 0.0161 max mem: 5716\n","Epoch: [39246] Total time: 0:00:01 (0.1004 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0225 (0.0236) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39247] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0246 (0.0246) time: 0.2810 data: 0.1918 max mem: 5716\n","Epoch: [39247] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0234 (0.0236) time: 0.0943 data: 0.0161 max mem: 5716\n","Epoch: [39247] Total time: 0:00:01 (0.0983 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0234 (0.0236) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39248] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0255 (0.0255) time: 0.2887 data: 0.2019 max mem: 5716\n","Epoch: [39248] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0232 (0.0240) time: 0.0955 data: 0.0170 max mem: 5716\n","Epoch: [39248] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0232 (0.0240) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39249] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0260 (0.0260) time: 0.2970 data: 0.2001 max mem: 5716\n","Epoch: [39249] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0257 (0.0246) time: 0.0954 data: 0.0168 max mem: 5716\n","Epoch: [39249] Total time: 0:00:01 (0.0994 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0257 
(0.0246) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39250] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0247 (0.0247) time: 0.2884 data: 0.2014 max mem: 5716\n","Epoch: [39250] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0234 (0.0245) time: 0.0966 data: 0.0169 max mem: 5716\n","Epoch: [39250] Total time: 0:00:01 (0.1007 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0234 (0.0245) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39251] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0240 (0.0240) time: 0.2818 data: 0.1910 max mem: 5716\n","Epoch: [39251] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0208 (0.0211) time: 0.0947 data: 0.0161 max mem: 5716\n","Epoch: [39251] Total time: 0:00:01 (0.0989 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0208 (0.0211) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39252] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0157 (0.0157) time: 0.2865 data: 0.1876 max mem: 5716\n","Epoch: [39252] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0205 (0.0208) time: 0.0955 data: 0.0158 max mem: 5716\n","Epoch: [39252] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0205 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39253] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0179 (0.0179) time: 0.2830 data: 0.1943 max mem: 5716\n","Epoch: [39253] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0202 (0.0209) time: 0.0978 data: 0.0164 max mem: 5716\n","Epoch: 
[39253] Total time: 0:00:01 (0.1019 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0202 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39254] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0242 (0.0242) time: 0.2960 data: 0.2054 max mem: 5716\n","Epoch: [39254] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0233 (0.0230) time: 0.0967 data: 0.0173 max mem: 5716\n","Epoch: [39254] Total time: 0:00:01 (0.1009 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0233 (0.0230) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39255] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0220 (0.0220) time: 0.3060 data: 0.2102 max mem: 5716\n","Epoch: [39255] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0232 (0.0234) time: 0.0975 data: 0.0177 max mem: 5716\n","Epoch: [39255] Total time: 0:00:01 (0.1017 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0232 (0.0234) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39256] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0265 (0.0265) time: 0.2931 data: 0.2047 max mem: 5716\n","Epoch: [39256] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0232 (0.0232) time: 0.0972 data: 0.0172 max mem: 5716\n","Epoch: [39256] Total time: 0:00:01 (0.1014 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0232 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39257] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0259 (0.0259) time: 0.2882 data: 0.1969 max mem: 5716\n","Epoch: [39257] [11/12] eta: 0:00:00 
lr: 0.000000 loss: 0.0237 (0.0242) time: 0.0948 data: 0.0166 max mem: 5716\n","Epoch: [39257] Total time: 0:00:01 (0.0989 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0237 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39258] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0295 (0.0295) time: 0.2896 data: 0.2026 max mem: 5716\n","Epoch: [39258] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0244 (0.0244) time: 0.0948 data: 0.0170 max mem: 5716\n","Epoch: [39258] Total time: 0:00:01 (0.0989 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0244 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39259] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0257 (0.0257) time: 0.3086 data: 0.2031 max mem: 5716\n","Epoch: [39259] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0246 (0.0244) time: 0.0964 data: 0.0171 max mem: 5716\n","Epoch: [39259] Total time: 0:00:01 (0.1005 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0246 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39260] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0204 (0.0204) time: 0.2958 data: 0.1971 max mem: 5716\n","Epoch: [39260] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0200 (0.0204) time: 0.0960 data: 0.0166 max mem: 5716\n","Epoch: [39260] Total time: 0:00:01 (0.1000 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0200 (0.0204) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39261] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0202 
(0.0202) time: 0.2877 data: 0.1933 max mem: 5716\n","Epoch: [39261] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0209 (0.0208) time: 0.0967 data: 0.0163 max mem: 5716\n","Epoch: [39261] Total time: 0:00:01 (0.1008 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0209 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39262] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0186 (0.0186) time: 0.2846 data: 0.1900 max mem: 5716\n","Epoch: [39262] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0196 (0.0208) time: 0.0957 data: 0.0160 max mem: 5716\n","Epoch: [39262] Total time: 0:00:01 (0.0999 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0196 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39263] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0225 (0.0225) time: 0.2910 data: 0.2053 max mem: 5716\n","Epoch: [39263] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0227 (0.0232) time: 0.0951 data: 0.0172 max mem: 5716\n","Epoch: [39263] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0227 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39264] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0270 (0.0270) time: 0.2872 data: 0.1980 max mem: 5716\n","Epoch: [39264] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0228 (0.0233) time: 0.0948 data: 0.0166 max mem: 5716\n","Epoch: [39264] Total time: 0:00:01 (0.0989 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0228 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39265] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0293 (0.0293) time: 0.2850 data: 0.1967 max mem: 5716\n","Epoch: [39265] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0230 (0.0233) time: 0.0965 data: 0.0166 max mem: 5716\n","Epoch: [39265] Total time: 0:00:01 (0.1006 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0230 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39266] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0250 (0.0250) time: 0.3023 data: 0.2050 max mem: 5716\n","Epoch: [39266] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0243 (0.0247) time: 0.0963 data: 0.0172 max mem: 5716\n","Epoch: [39266] Total time: 0:00:01 (0.1005 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0243 (0.0247) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39267] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0247 (0.0247) time: 0.2872 data: 0.1971 max mem: 5716\n","Epoch: [39267] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0239 (0.0242) time: 0.0954 data: 0.0166 max mem: 5716\n","Epoch: [39267] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0239 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39268] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0243 (0.0243) time: 0.2984 data: 0.1955 max mem: 5716\n","Epoch: [39268] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0243 (0.0242) time: 0.0958 data: 0.0164 max mem: 5716\n","Epoch: [39268] Total time: 0:00:01 (0.1006 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0243 
(0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39269] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0235 (0.0235) time: 0.2824 data: 0.1935 max mem: 5716\n","Epoch: [39269] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0208 (0.0207) time: 0.0945 data: 0.0163 max mem: 5716\n","Epoch: [39269] Total time: 0:00:01 (0.0987 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0208 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39270] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0201 (0.0201) time: 0.2876 data: 0.1994 max mem: 5716\n","Epoch: [39270] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0201 (0.0209) time: 0.0955 data: 0.0168 max mem: 5716\n","Epoch: [39270] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0201 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39271] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0138 (0.0138) time: 0.2786 data: 0.1895 max mem: 5716\n","Epoch: [39271] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0196 (0.0208) time: 0.0942 data: 0.0159 max mem: 5716\n","Epoch: [39271] Total time: 0:00:01 (0.0983 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0196 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39272] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0294 (0.0294) time: 0.2877 data: 0.1978 max mem: 5716\n","Epoch: [39272] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0230 (0.0234) time: 0.0942 data: 0.0166 max mem: 5716\n","Epoch: 
[39272] Total time: 0:00:01 (0.0983 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0230 (0.0234) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39273] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0253 (0.0253) time: 0.2798 data: 0.1891 max mem: 5716\n","Epoch: [39273] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0230 (0.0231) time: 0.0938 data: 0.0159 max mem: 5716\n","Epoch: [39273] Total time: 0:00:01 (0.0979 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0230 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39274] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0213 (0.0213) time: 0.2887 data: 0.1994 max mem: 5716\n","Epoch: [39274] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0218 (0.0230) time: 0.0956 data: 0.0168 max mem: 5716\n","Epoch: [39274] Total time: 0:00:01 (0.0998 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0218 (0.0230) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39275] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0208 (0.0208) time: 0.2903 data: 0.2029 max mem: 5716\n","Epoch: [39275] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0231 (0.0244) time: 0.0950 data: 0.0171 max mem: 5716\n","Epoch: [39275] Total time: 0:00:01 (0.0991 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0231 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39276] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0230 (0.0230) time: 0.2872 data: 0.1978 max mem: 5716\n","Epoch: [39276] [11/12] eta: 0:00:00 
lr: 0.000000 loss: 0.0233 (0.0243) time: 0.0950 data: 0.0167 max mem: 5716\n","Epoch: [39276] Total time: 0:00:01 (0.0991 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0233 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39277] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0220 (0.0220) time: 0.2851 data: 0.1966 max mem: 5716\n","Epoch: [39277] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0243 (0.0244) time: 0.0966 data: 0.0166 max mem: 5716\n","Epoch: [39277] Total time: 0:00:01 (0.1008 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0243 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39278] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0222 (0.0222) time: 0.2969 data: 0.2074 max mem: 5716\n","Epoch: [39278] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0214 (0.0205) time: 0.0985 data: 0.0174 max mem: 5716\n","Epoch: [39278] Total time: 0:00:01 (0.1026 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0214 (0.0205) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39279] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0169 (0.0169) time: 0.3155 data: 0.2083 max mem: 5716\n","Epoch: [39279] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0190 (0.0209) time: 0.0989 data: 0.0175 max mem: 5716\n","Epoch: [39279] Total time: 0:00:01 (0.1031 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0190 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39280] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0237 
(0.0237) time: 0.2868 data: 0.1948 max mem: 5716\n","Epoch: [39280] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0208 (0.0208) time: 0.0961 data: 0.0164 max mem: 5716\n","Epoch: [39280] Total time: 0:00:01 (0.1002 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0208 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39281] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0176 (0.0176) time: 0.3083 data: 0.2072 max mem: 5716\n","Epoch: [39281] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0225 (0.0233) time: 0.0981 data: 0.0174 max mem: 5716\n","Epoch: [39281] Total time: 0:00:01 (0.1023 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0225 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39282] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0257 (0.0257) time: 0.2879 data: 0.1935 max mem: 5716\n","Epoch: [39282] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0222 (0.0233) time: 0.0948 data: 0.0163 max mem: 5716\n","Epoch: [39282] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0222 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39283] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0275 (0.0275) time: 0.2888 data: 0.2001 max mem: 5716\n","Epoch: [39283] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0235 (0.0235) time: 0.0980 data: 0.0168 max mem: 5716\n","Epoch: [39283] Total time: 0:00:01 (0.1034 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0235 (0.0235) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39284] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0222 (0.0222) time: 0.2824 data: 0.1924 max mem: 5716\n","Epoch: [39284] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0240 (0.0243) time: 0.0943 data: 0.0162 max mem: 5716\n","Epoch: [39284] Total time: 0:00:01 (0.0984 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0240 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39285] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0252 (0.0252) time: 0.2933 data: 0.2068 max mem: 5716\n","Epoch: [39285] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0252 (0.0242) time: 0.0957 data: 0.0174 max mem: 5716\n","Epoch: [39285] Total time: 0:00:01 (0.0997 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0252 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39286] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0252 (0.0252) time: 0.2828 data: 0.1932 max mem: 5716\n","Epoch: [39286] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0243 (0.0244) time: 0.0949 data: 0.0163 max mem: 5716\n","Epoch: [39286] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0243 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39287] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0194 (0.0194) time: 0.2892 data: 0.1998 max mem: 5716\n","Epoch: [39287] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0196 (0.0207) time: 0.0955 data: 0.0168 max mem: 5716\n","Epoch: [39287] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0196 
(0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39288] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0238 (0.0238) time: 0.2924 data: 0.2050 max mem: 5716\n","Epoch: [39288] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0221 (0.0211) time: 0.0953 data: 0.0172 max mem: 5716\n","Epoch: [39288] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0221 (0.0211) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39289] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0233 (0.0233) time: 0.2929 data: 0.2031 max mem: 5716\n","Epoch: [39289] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0199 (0.0207) time: 0.0959 data: 0.0171 max mem: 5716\n","Epoch: [39289] Total time: 0:00:01 (0.1001 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0199 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39290] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0284 (0.0284) time: 0.2988 data: 0.2124 max mem: 5716\n","Epoch: [39290] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0232 (0.0231) time: 0.0993 data: 0.0179 max mem: 5716\n","Epoch: [39290] Total time: 0:00:01 (0.1035 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0232 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39291] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0193 (0.0193) time: 0.2965 data: 0.2073 max mem: 5716\n","Epoch: [39291] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0228 (0.0230) time: 0.0957 data: 0.0174 max mem: 5716\n","Epoch: 
[39291] Total time: 0:00:01 (0.0998 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0228 (0.0230) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39292] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0218 (0.0218) time: 0.2948 data: 0.2041 max mem: 5716\n","Epoch: [39292] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0228 (0.0233) time: 0.0958 data: 0.0172 max mem: 5716\n","Epoch: [39292] Total time: 0:00:01 (0.0999 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0228 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39293] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0230 (0.0230) time: 0.2902 data: 0.1992 max mem: 5716\n","Epoch: [39293] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0236 (0.0242) time: 0.0953 data: 0.0168 max mem: 5716\n","Epoch: [39293] Total time: 0:00:01 (0.0994 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0236 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39294] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0232 (0.0232) time: 0.2873 data: 0.1972 max mem: 5716\n","Epoch: [39294] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0243 (0.0242) time: 0.0950 data: 0.0166 max mem: 5716\n","Epoch: [39294] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0243 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39295] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0217 (0.0217) time: 0.2893 data: 0.1985 max mem: 5716\n","Epoch: [39295] [11/12] eta: 0:00:00 
lr: 0.000000 loss: 0.0249 (0.0244) time: 0.0961 data: 0.0167 max mem: 5716\n","Epoch: [39295] Total time: 0:00:01 (0.1003 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0249 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39296] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0273 (0.0273) time: 0.2948 data: 0.2057 max mem: 5716\n","Epoch: [39296] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0207 (0.0208) time: 0.0975 data: 0.0173 max mem: 5716\n","Epoch: [39296] Total time: 0:00:01 (0.1019 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0207 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39297] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0246 (0.0246) time: 0.3022 data: 0.2031 max mem: 5716\n","Epoch: [39297] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0208 (0.0209) time: 0.0973 data: 0.0171 max mem: 5716\n","Epoch: [39297] Total time: 0:00:01 (0.1014 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0208 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39298] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0234 (0.0234) time: 0.3028 data: 0.2161 max mem: 5716\n","Epoch: [39298] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0201 (0.0205) time: 0.0964 data: 0.0182 max mem: 5716\n","Epoch: [39298] Total time: 0:00:01 (0.1006 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0201 (0.0205) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39299] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0216 
(0.0216) time: 0.2877 data: 0.1969 max mem: 5716\n","Epoch: [39299] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0238 (0.0235) time: 0.0975 data: 0.0166 max mem: 5716\n","Epoch: [39299] Total time: 0:00:01 (0.1017 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0238 (0.0235) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39300] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0263 (0.0263) time: 0.2901 data: 0.2014 max mem: 5716\n","Epoch: [39300] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0221 (0.0230) time: 0.0953 data: 0.0169 max mem: 5716\n","Epoch: [39300] Total time: 0:00:01 (0.0993 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0221 (0.0230) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39301] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0231 (0.0231) time: 0.2887 data: 0.2025 max mem: 5716\n","Epoch: [39301] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0231 (0.0226) time: 0.0957 data: 0.0170 max mem: 5716\n","Epoch: [39301] Total time: 0:00:01 (0.0997 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0231 (0.0226) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39302] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0207 (0.0207) time: 0.2904 data: 0.2032 max mem: 5716\n","Epoch: [39302] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0249 (0.0247) time: 0.0948 data: 0.0171 max mem: 5716\n","Epoch: [39302] Total time: 0:00:01 (0.0989 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0249 (0.0247) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39303] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0254 (0.0254) time: 0.2877 data: 0.1973 max mem: 5716\n","Epoch: [39303] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0236 (0.0244) time: 0.0951 data: 0.0166 max mem: 5716\n","Epoch: [39303] Total time: 0:00:01 (0.1007 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0236 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39304] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0187 (0.0187) time: 0.2890 data: 0.1934 max mem: 5716\n","Epoch: [39304] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0225 (0.0241) time: 0.0956 data: 0.0163 max mem: 5716\n","Epoch: [39304] Total time: 0:00:01 (0.1011 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0225 (0.0241) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39305] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0211 (0.0211) time: 0.3003 data: 0.1971 max mem: 5716\n","Epoch: [39305] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0214 (0.0211) time: 0.0972 data: 0.0166 max mem: 5716\n","Epoch: [39305] Total time: 0:00:01 (0.1012 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0214 (0.0211) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39306] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0215 (0.0215) time: 0.2831 data: 0.1922 max mem: 5716\n","Epoch: [39306] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0210 (0.0207) time: 0.0942 data: 0.0162 max mem: 5716\n","Epoch: [39306] Total time: 0:00:01 (0.0983 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0210 
(0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39307] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0206 (0.0206) time: 0.2922 data: 0.2053 max mem: 5716\n","Epoch: [39307] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0206 (0.0209) time: 0.0947 data: 0.0172 max mem: 5716\n","Epoch: [39307] Total time: 0:00:01 (0.0988 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0206 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39308] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0213 (0.0213) time: 0.2921 data: 0.2040 max mem: 5716\n","Epoch: [39308] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0229 (0.0232) time: 0.0963 data: 0.0172 max mem: 5716\n","Epoch: [39308] Total time: 0:00:01 (0.1004 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0229 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39309] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0206 (0.0206) time: 0.2957 data: 0.2084 max mem: 5716\n","Epoch: [39309] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0221 (0.0231) time: 0.0979 data: 0.0175 max mem: 5716\n","Epoch: [39309] Total time: 0:00:01 (0.1021 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0221 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39310] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0260 (0.0260) time: 0.2996 data: 0.2104 max mem: 5716\n","Epoch: [39310] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0228 (0.0232) time: 0.0958 data: 0.0177 max mem: 5716\n","Epoch: 
[39310] Total time: 0:00:01 (0.0999 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0228 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39311] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0266 (0.0266) time: 0.2889 data: 0.1977 max mem: 5716\n","Epoch: [39311] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0239 (0.0244) time: 0.0952 data: 0.0166 max mem: 5716\n","Epoch: [39311] Total time: 0:00:01 (0.0993 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0239 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39312] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0235 (0.0235) time: 0.3035 data: 0.2143 max mem: 5716\n","Epoch: [39312] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0246 (0.0246) time: 0.0960 data: 0.0180 max mem: 5716\n","Epoch: [39312] Total time: 0:00:01 (0.1002 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0246 (0.0246) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39313] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0202 (0.0202) time: 0.2919 data: 0.2040 max mem: 5716\n","Epoch: [39313] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0248 (0.0245) time: 0.0958 data: 0.0172 max mem: 5716\n","Epoch: [39313] Total time: 0:00:01 (0.1000 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0248 (0.0245) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39314] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0225 (0.0225) time: 0.2956 data: 0.2077 max mem: 5716\n","Epoch: [39314] [11/12] eta: 
0:00:00 lr: 0.000000 loss: 0.0208 (0.0208) time: 0.0955 data: 0.0174 max mem: 5716\n","Epoch: [39314] Total time: 0:00:01 (0.0997 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0208 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39315] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0235 (0.0235) time: 0.2819 data: 0.1896 max mem: 5716\n","Epoch: [39315] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0209 (0.0208) time: 0.0944 data: 0.0160 max mem: 5716\n","Epoch: [39315] Total time: 0:00:01 (0.0986 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0209 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39316] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0224 (0.0224) time: 0.2850 data: 0.1975 max mem: 5716\n","Epoch: [39316] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0207 (0.0208) time: 0.0962 data: 0.0166 max mem: 5716\n","Epoch: [39316] Total time: 0:00:01 (0.1003 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0207 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39317] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0196 (0.0196) time: 0.2869 data: 0.1993 max mem: 5716\n","Epoch: [39317] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0224 (0.0230) time: 0.0954 data: 0.0168 max mem: 5716\n","Epoch: [39317] Total time: 0:00:01 (0.1013 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0224 (0.0230) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39318] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 
0.0237 (0.0237) time: 0.2861 data: 0.1969 max mem: 5716\n","Epoch: [39318] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0237 (0.0233) time: 0.0946 data: 0.0165 max mem: 5716\n","Epoch: [39318] Total time: 0:00:01 (0.0987 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0237 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39319] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0260 (0.0260) time: 0.3064 data: 0.2054 max mem: 5716\n","Epoch: [39319] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0224 (0.0228) time: 0.0964 data: 0.0173 max mem: 5716\n","Epoch: [39319] Total time: 0:00:01 (0.1005 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0224 (0.0228) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39320] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0260 (0.0260) time: 0.2993 data: 0.2096 max mem: 5716\n","Epoch: [39320] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0239 (0.0243) time: 0.0958 data: 0.0176 max mem: 5716\n","Epoch: [39320] Total time: 0:00:01 (0.1012 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0239 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39321] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0219 (0.0219) time: 0.2893 data: 0.1982 max mem: 5716\n","Epoch: [39321] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0237 (0.0239) time: 0.0962 data: 0.0167 max mem: 5716\n","Epoch: [39321] Total time: 0:00:01 (0.1004 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0237 (0.0239) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39322] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0294 (0.0294) time: 0.2967 data: 0.2074 max mem: 5716\n","Epoch: [39322] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0244 (0.0243) time: 0.0969 data: 0.0174 max mem: 5716\n","Epoch: [39322] Total time: 0:00:01 (0.1011 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0244 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39323] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0199 (0.0199) time: 0.2986 data: 0.2078 max mem: 5716\n","Epoch: [39323] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0205 (0.0208) time: 0.0960 data: 0.0175 max mem: 5716\n","Epoch: [39323] Total time: 0:00:01 (0.1002 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0205 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39324] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0207 (0.0207) time: 0.2936 data: 0.2038 max mem: 5716\n","Epoch: [39324] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0209 (0.0208) time: 0.0953 data: 0.0172 max mem: 5716\n","Epoch: [39324] Total time: 0:00:01 (0.1005 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0209 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39325] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0221 (0.0221) time: 0.2884 data: 0.1977 max mem: 5716\n","Epoch: [39325] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0212 (0.0206) time: 0.0947 data: 0.0166 max mem: 5716\n","Epoch: [39325] Total time: 0:00:01 (0.1002 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0212 
(0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39326] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0266 (0.0266) time: 0.2933 data: 0.2058 max mem: 5716\n","Epoch: [39326] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0228 (0.0231) time: 0.0956 data: 0.0173 max mem: 5716\n","Epoch: [39326] Total time: 0:00:01 (0.1014 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0228 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39327] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0194 (0.0194) time: 0.2892 data: 0.2018 max mem: 5716\n","Epoch: [39327] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0231 (0.0232) time: 0.0945 data: 0.0169 max mem: 5716\n","Epoch: [39327] Total time: 0:00:01 (0.0986 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0231 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39328] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0182 (0.0182) time: 0.2943 data: 0.2059 max mem: 5716\n","Epoch: [39328] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0222 (0.0230) time: 0.0956 data: 0.0173 max mem: 5716\n","Epoch: [39328] Total time: 0:00:01 (0.0998 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0222 (0.0230) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39329] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0258 (0.0258) time: 0.2929 data: 0.2042 max mem: 5716\n","Epoch: [39329] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0241 (0.0242) time: 0.0984 data: 0.0172 max mem: 5716\n","Epoch: 
[39329] Total time: 0:00:01 (0.1026 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0241 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39330] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0258 (0.0258) time: 0.2955 data: 0.1973 max mem: 5716\n","Epoch: [39330] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0240 (0.0242) time: 0.0975 data: 0.0166 max mem: 5716\n","Epoch: [39330] Total time: 0:00:01 (0.1030 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0240 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39331] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0246 (0.0246) time: 0.2923 data: 0.2054 max mem: 5716\n","Epoch: [39331] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0245 (0.0244) time: 0.0950 data: 0.0173 max mem: 5716\n","Epoch: [39331] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0245 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39332] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0187 (0.0187) time: 0.2885 data: 0.1965 max mem: 5716\n","Epoch: [39332] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0195 (0.0207) time: 0.0961 data: 0.0166 max mem: 5716\n","Epoch: [39332] Total time: 0:00:01 (0.1003 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0195 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39333] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0219 (0.0219) time: 0.2901 data: 0.2013 max mem: 5716\n","Epoch: [39333] [11/12] eta: 
0:00:00 lr: 0.000000 loss: 0.0202 (0.0207) time: 0.0957 data: 0.0169 max mem: 5716\n","Epoch: [39333] Total time: 0:00:01 (0.0997 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0202 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39334] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0220 (0.0220) time: 0.2923 data: 0.2029 max mem: 5716\n","Epoch: [39334] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0202 (0.0208) time: 0.0957 data: 0.0171 max mem: 5716\n","Epoch: [39334] Total time: 0:00:01 (0.0998 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0202 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39335] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0210 (0.0210) time: 0.3042 data: 0.2031 max mem: 5716\n","Epoch: [39335] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0227 (0.0234) time: 0.0972 data: 0.0171 max mem: 5716\n","Epoch: [39335] Total time: 0:00:01 (0.1014 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0227 (0.0234) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39336] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0223 (0.0223) time: 0.2913 data: 0.1921 max mem: 5716\n","Epoch: [39336] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0223 (0.0229) time: 0.0954 data: 0.0162 max mem: 5716\n","Epoch: [39336] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0223 (0.0229) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39337] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 
0.0255 (0.0255) time: 0.2931 data: 0.2002 max mem: 5716\n","Epoch: [39337] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0227 (0.0234) time: 0.0966 data: 0.0168 max mem: 5716\n","Epoch: [39337] Total time: 0:00:01 (0.1022 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0227 (0.0234) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39338] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0294 (0.0294) time: 0.2788 data: 0.1892 max mem: 5716\n","Epoch: [39338] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0235 (0.0241) time: 0.0955 data: 0.0159 max mem: 5716\n","Epoch: [39338] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0235 (0.0241) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39339] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0302 (0.0302) time: 0.2845 data: 0.1928 max mem: 5716\n","Epoch: [39339] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0229 (0.0242) time: 0.0957 data: 0.0162 max mem: 5716\n","Epoch: [39339] Total time: 0:00:01 (0.0998 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0229 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39340] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0200 (0.0200) time: 0.2937 data: 0.2059 max mem: 5716\n","Epoch: [39340] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0237 (0.0243) time: 0.0952 data: 0.0173 max mem: 5716\n","Epoch: [39340] Total time: 0:00:01 (0.0993 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0237 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39341] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0247 (0.0247) time: 0.2833 data: 0.1919 max mem: 5716\n","Epoch: [39341] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0204 (0.0211) time: 0.0949 data: 0.0161 max mem: 5716\n","Epoch: [39341] Total time: 0:00:01 (0.1010 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0204 (0.0211) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39342] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0218 (0.0218) time: 0.2874 data: 0.1968 max mem: 5716\n","Epoch: [39342] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0196 (0.0206) time: 0.0950 data: 0.0165 max mem: 5716\n","Epoch: [39342] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0196 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39343] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0218 (0.0218) time: 0.2947 data: 0.2089 max mem: 5716\n","Epoch: [39343] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0205 (0.0205) time: 0.0954 data: 0.0176 max mem: 5716\n","Epoch: [39343] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0205 (0.0205) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39344] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0208 (0.0208) time: 0.2924 data: 0.2050 max mem: 5716\n","Epoch: [39344] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0241 (0.0233) time: 0.0952 data: 0.0173 max mem: 5716\n","Epoch: [39344] Total time: 0:00:01 (0.0994 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0241 
(0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39345] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0194 (0.0194) time: 0.2913 data: 0.1924 max mem: 5716\n","Epoch: [39345] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0222 (0.0230) time: 0.0958 data: 0.0162 max mem: 5716\n","Epoch: [39345] Total time: 0:00:01 (0.1013 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0222 (0.0230) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39346] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0227 (0.0227) time: 0.2926 data: 0.2009 max mem: 5716\n","Epoch: [39346] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0231 (0.0231) time: 0.0973 data: 0.0169 max mem: 5716\n","Epoch: [39346] Total time: 0:00:01 (0.1022 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0231 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39347] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0232 (0.0232) time: 0.2924 data: 0.2001 max mem: 5716\n","Epoch: [39347] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0233 (0.0242) time: 0.0960 data: 0.0168 max mem: 5716\n","Epoch: [39347] Total time: 0:00:01 (0.1011 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0233 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39348] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0215 (0.0215) time: 0.2889 data: 0.1950 max mem: 5716\n","Epoch: [39348] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0235 (0.0244) time: 0.0951 data: 0.0164 max mem: 5716\n","Epoch: 
[39348] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0235 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39349] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0232 (0.0232) time: 0.2858 data: 0.1966 max mem: 5716\n","Epoch: [39349] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0237 (0.0243) time: 0.0949 data: 0.0165 max mem: 5716\n","Epoch: [39349] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0237 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39350] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0203 (0.0203) time: 0.3145 data: 0.2150 max mem: 5716\n","Epoch: [39350] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0203 (0.0210) time: 0.0975 data: 0.0181 max mem: 5716\n","Epoch: [39350] Total time: 0:00:01 (0.1016 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0203 (0.0210) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39351] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0201 (0.0201) time: 0.2968 data: 0.2113 max mem: 5716\n","Epoch: [39351] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0204 (0.0206) time: 0.0952 data: 0.0178 max mem: 5716\n","Epoch: [39351] Total time: 0:00:01 (0.0994 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0204 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39352] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0198 (0.0198) time: 0.2917 data: 0.2048 max mem: 5716\n","Epoch: [39352] [11/12] eta: 0:00:00 
lr: 0.000000 loss: 0.0198 (0.0206) time: 0.0952 data: 0.0172 max mem: 5716\n","Epoch: [39352] Total time: 0:00:01 (0.0994 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0198 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39353] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0214 (0.0214) time: 0.2884 data: 0.1955 max mem: 5716\n","Epoch: [39353] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0227 (0.0233) time: 0.0981 data: 0.0164 max mem: 5716\n","Epoch: [39353] Total time: 0:00:01 (0.1023 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0227 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39354] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0274 (0.0274) time: 0.2954 data: 0.2074 max mem: 5716\n","Epoch: [39354] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0219 (0.0231) time: 0.0979 data: 0.0175 max mem: 5716\n","Epoch: [39354] Total time: 0:00:01 (0.1020 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0219 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39355] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0268 (0.0268) time: 0.2996 data: 0.2111 max mem: 5716\n","Epoch: [39355] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0222 (0.0229) time: 0.0969 data: 0.0177 max mem: 5716\n","Epoch: [39355] Total time: 0:00:01 (0.1010 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0222 (0.0229) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39356] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0208 
(0.0208) time: 0.2927 data: 0.1996 max mem: 5716\n","Epoch: [39356] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0243 (0.0242) time: 0.0962 data: 0.0168 max mem: 5716\n","Epoch: [39356] Total time: 0:00:01 (0.1004 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0243 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39357] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0201 (0.0201) time: 0.2865 data: 0.1964 max mem: 5716\n","Epoch: [39357] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0246 (0.0247) time: 0.0950 data: 0.0165 max mem: 5716\n","Epoch: [39357] Total time: 0:00:01 (0.0991 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0246 (0.0247) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39358] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0291 (0.0291) time: 0.2896 data: 0.1981 max mem: 5716\n","Epoch: [39358] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0236 (0.0243) time: 0.0979 data: 0.0167 max mem: 5716\n","Epoch: [39358] Total time: 0:00:01 (0.1020 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0236 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39359] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0224 (0.0224) time: 0.2929 data: 0.2044 max mem: 5716\n","Epoch: [39359] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0222 (0.0209) time: 0.0952 data: 0.0172 max mem: 5716\n","Epoch: [39359] Total time: 0:00:01 (0.0993 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0222 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39360] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0261 (0.0261) time: 0.2943 data: 0.2068 max mem: 5716\n","Epoch: [39360] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0200 (0.0208) time: 0.0955 data: 0.0174 max mem: 5716\n","Epoch: [39360] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0200 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39361] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0161 (0.0161) time: 0.2958 data: 0.2077 max mem: 5716\n","Epoch: [39361] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0200 (0.0207) time: 0.0953 data: 0.0175 max mem: 5716\n","Epoch: [39361] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0200 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39362] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0249 (0.0249) time: 0.2944 data: 0.2051 max mem: 5716\n","Epoch: [39362] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0235 (0.0230) time: 0.0952 data: 0.0172 max mem: 5716\n","Epoch: [39362] Total time: 0:00:01 (0.0994 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0235 (0.0230) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39363] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0235 (0.0235) time: 0.2948 data: 0.2038 max mem: 5716\n","Epoch: [39363] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0233 (0.0233) time: 0.0989 data: 0.0171 max mem: 5716\n","Epoch: [39363] Total time: 0:00:01 (0.1031 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0233 
(0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39364] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0250 (0.0250) time: 0.2921 data: 0.2043 max mem: 5716\n","Epoch: [39364] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0219 (0.0229) time: 0.0963 data: 0.0172 max mem: 5716\n","Epoch: [39364] Total time: 0:00:01 (0.1005 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0219 (0.0229) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39365] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0265 (0.0265) time: 0.2934 data: 0.2047 max mem: 5716\n","Epoch: [39365] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0231 (0.0241) time: 0.0948 data: 0.0172 max mem: 5716\n","Epoch: [39365] Total time: 0:00:01 (0.0989 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0231 (0.0241) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39366] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0229 (0.0229) time: 0.2880 data: 0.1976 max mem: 5716\n","Epoch: [39366] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0243 (0.0244) time: 0.0967 data: 0.0166 max mem: 5716\n","Epoch: [39366] Total time: 0:00:01 (0.1009 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0243 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39367] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0288 (0.0288) time: 0.2969 data: 0.2083 max mem: 5716\n","Epoch: [39367] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0239 (0.0245) time: 0.0966 data: 0.0175 max mem: 5716\n","Epoch: 
[39367] Total time: 0:00:01 (0.1008 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0239 (0.0245) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39368] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0257 (0.0257) time: 0.2980 data: 0.2089 max mem: 5716\n","Epoch: [39368] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0209 (0.0209) time: 0.0967 data: 0.0176 max mem: 5716\n","Epoch: [39368] Total time: 0:00:01 (0.1009 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0209 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39369] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0191 (0.0191) time: 0.2932 data: 0.2062 max mem: 5716\n","Epoch: [39369] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0209 (0.0209) time: 0.0955 data: 0.0173 max mem: 5716\n","Epoch: [39369] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0209 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39370] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0298 (0.0298) time: 0.2908 data: 0.1945 max mem: 5716\n","Epoch: [39370] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0196 (0.0208) time: 0.0949 data: 0.0164 max mem: 5716\n","Epoch: [39370] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0196 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39371] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0256 (0.0256) time: 0.2757 data: 0.1866 max mem: 5716\n","Epoch: [39371] [11/12] eta: 0:00:00 
lr: 0.000000 loss: 0.0216 (0.0228) time: 0.0943 data: 0.0157 max mem: 5716\n","Epoch: [39371] Total time: 0:00:01 (0.0983 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0216 (0.0228) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39372] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0235 (0.0235) time: 0.2920 data: 0.2027 max mem: 5716\n","Epoch: [39372] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0221 (0.0231) time: 0.0949 data: 0.0170 max mem: 5716\n","Epoch: [39372] Total time: 0:00:01 (0.1003 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0221 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39373] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0256 (0.0256) time: 0.2886 data: 0.1963 max mem: 5716\n","Epoch: [39373] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0226 (0.0229) time: 0.0962 data: 0.0165 max mem: 5716\n","Epoch: [39373] Total time: 0:00:01 (0.1003 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0226 (0.0229) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39374] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0261 (0.0261) time: 0.2934 data: 0.2035 max mem: 5716\n","Epoch: [39374] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0253 (0.0245) time: 0.0957 data: 0.0171 max mem: 5716\n","Epoch: [39374] Total time: 0:00:01 (0.0998 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0253 (0.0245) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39375] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0221 
(0.0221) time: 0.2942 data: 0.2067 max mem: 5716\n","Epoch: [39375] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0237 (0.0240) time: 0.0958 data: 0.0174 max mem: 5716\n","Epoch: [39375] Total time: 0:00:01 (0.0999 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0237 (0.0240) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39376] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0227 (0.0227) time: 0.2841 data: 0.1959 max mem: 5716\n","Epoch: [39376] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0227 (0.0243) time: 0.0946 data: 0.0165 max mem: 5716\n","Epoch: [39376] Total time: 0:00:01 (0.0986 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0227 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39377] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0231 (0.0231) time: 0.2962 data: 0.2083 max mem: 5716\n","Epoch: [39377] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0215 (0.0210) time: 0.0962 data: 0.0175 max mem: 5716\n","Epoch: [39377] Total time: 0:00:01 (0.1003 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0215 (0.0210) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39378] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0308 (0.0308) time: 0.3009 data: 0.1984 max mem: 5716\n","Epoch: [39378] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0194 (0.0207) time: 0.0959 data: 0.0167 max mem: 5716\n","Epoch: [39378] Total time: 0:00:01 (0.1000 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0194 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39379] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0132 (0.0132) time: 0.2944 data: 0.2063 max mem: 5716\n","Epoch: [39379] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0216 (0.0208) time: 0.0959 data: 0.0174 max mem: 5716\n","Epoch: [39379] Total time: 0:00:01 (0.1001 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0216 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39380] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0199 (0.0199) time: 0.2951 data: 0.2013 max mem: 5716\n","Epoch: [39380] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0225 (0.0232) time: 0.0959 data: 0.0169 max mem: 5716\n","Epoch: [39380] Total time: 0:00:01 (0.1001 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0225 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39381] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0255 (0.0255) time: 0.2940 data: 0.2067 max mem: 5716\n","Epoch: [39381] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0238 (0.0238) time: 0.0973 data: 0.0174 max mem: 5716\n","Epoch: [39381] Total time: 0:00:01 (0.1016 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0238 (0.0238) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39382] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0206 (0.0206) time: 0.2881 data: 0.1955 max mem: 5716\n","Epoch: [39382] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0238 (0.0231) time: 0.0946 data: 0.0164 max mem: 5716\n","Epoch: [39382] Total time: 0:00:01 (0.0987 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0238 
(0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39383] [ 0/12] eta: 0:00:05 lr: 0.000000 loss: 0.0285 (0.0285) time: 0.4891 data: 0.4065 max mem: 5716\n","Epoch: [39383] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0249 (0.0246) time: 0.1111 data: 0.0340 max mem: 5716\n","Epoch: [39383] Total time: 0:00:01 (0.1152 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0249 (0.0246) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39384] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0204 (0.0204) time: 0.2843 data: 0.1968 max mem: 5716\n","Epoch: [39384] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0254 (0.0242) time: 0.0943 data: 0.0165 max mem: 5716\n","Epoch: [39384] Total time: 0:00:01 (0.0984 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0254 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39385] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0251 (0.0251) time: 0.2860 data: 0.1982 max mem: 5716\n","Epoch: [39385] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0229 (0.0240) time: 0.0944 data: 0.0167 max mem: 5716\n","Epoch: [39385] Total time: 0:00:01 (0.0985 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0229 (0.0240) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39386] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0254 (0.0254) time: 0.2885 data: 0.2004 max mem: 5716\n","Epoch: [39386] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0203 (0.0209) time: 0.0949 data: 0.0168 max mem: 5716\n","Epoch: 
[39386] Total time: 0:00:01 (0.0991 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0203 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39387] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0183 (0.0183) time: 0.2876 data: 0.1983 max mem: 5716\n","Epoch: [39387] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0201 (0.0207) time: 0.0956 data: 0.0167 max mem: 5716\n","Epoch: [39387] Total time: 0:00:01 (0.0997 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0201 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39388] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0201 (0.0201) time: 0.2847 data: 0.1961 max mem: 5716\n","Epoch: [39388] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0201 (0.0210) time: 0.0945 data: 0.0165 max mem: 5716\n","Epoch: [39388] Total time: 0:00:01 (0.0987 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0201 (0.0210) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39389] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0255 (0.0255) time: 0.2927 data: 0.2044 max mem: 5716\n","Epoch: [39389] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0232 (0.0233) time: 0.0953 data: 0.0172 max mem: 5716\n","Epoch: [39389] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0232 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39390] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0200 (0.0200) time: 0.2874 data: 0.1964 max mem: 5716\n","Epoch: [39390] [11/12] eta: 0:00:00 
lr: 0.000000 loss: 0.0236 (0.0232) time: 0.0962 data: 0.0165 max mem: 5716\n","Epoch: [39390] Total time: 0:00:01 (0.1004 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0236 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39391] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0242 (0.0242) time: 0.2886 data: 0.2009 max mem: 5716\n","Epoch: [39391] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0227 (0.0233) time: 0.0952 data: 0.0169 max mem: 5716\n","Epoch: [39391] Total time: 0:00:01 (0.1014 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0227 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39392] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0240 (0.0240) time: 0.3101 data: 0.2233 max mem: 5716\n","Epoch: [39392] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0240 (0.0241) time: 0.0970 data: 0.0188 max mem: 5716\n","Epoch: [39392] Total time: 0:00:01 (0.1011 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0240 (0.0241) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39393] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0264 (0.0264) time: 0.2939 data: 0.2004 max mem: 5716\n","Epoch: [39393] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0241 (0.0243) time: 0.0974 data: 0.0169 max mem: 5716\n","Epoch: [39393] Total time: 0:00:01 (0.1015 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0241 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39394] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0256 
(0.0256) time: 0.2986 data: 0.1978 max mem: 5716\n","Epoch: [39394] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0238 (0.0243) time: 0.0962 data: 0.0166 max mem: 5716\n","Epoch: [39394] Total time: 0:00:01 (0.1003 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0238 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39395] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0218 (0.0218) time: 0.3086 data: 0.2095 max mem: 5716\n","Epoch: [39395] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0203 (0.0208) time: 0.0971 data: 0.0176 max mem: 5716\n","Epoch: [39395] Total time: 0:00:01 (0.1012 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0203 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39396] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0165 (0.0165) time: 0.2912 data: 0.2035 max mem: 5716\n","Epoch: [39396] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0208 (0.0207) time: 0.0957 data: 0.0171 max mem: 5716\n","Epoch: [39396] Total time: 0:00:01 (0.0998 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0208 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39397] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0264 (0.0264) time: 0.2861 data: 0.1952 max mem: 5716\n","Epoch: [39397] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0209 (0.0210) time: 0.0952 data: 0.0164 max mem: 5716\n","Epoch: [39397] Total time: 0:00:01 (0.0993 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0209 (0.0210) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39398] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0209 (0.0209) time: 0.2881 data: 0.1982 max mem: 5716\n","Epoch: [39398] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0216 (0.0234) time: 0.0956 data: 0.0167 max mem: 5716\n","Epoch: [39398] Total time: 0:00:01 (0.0997 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0216 (0.0234) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39399] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0220 (0.0220) time: 0.2882 data: 0.2008 max mem: 5716\n","Epoch: [39399] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0229 (0.0231) time: 0.0985 data: 0.0169 max mem: 5716\n","Epoch: [39399] Total time: 0:00:01 (0.1026 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0229 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39400] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0199 (0.0199) time: 0.2898 data: 0.2029 max mem: 5716\n","Epoch: [39400] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0235 (0.0233) time: 0.0950 data: 0.0170 max mem: 5716\n","Epoch: [39400] Total time: 0:00:01 (0.0991 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0235 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39401] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0237 (0.0237) time: 0.2831 data: 0.1956 max mem: 5716\n","Epoch: [39401] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0234 (0.0243) time: 0.0951 data: 0.0165 max mem: 5716\n","Epoch: [39401] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0234 
(0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39402] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0278 (0.0278) time: 0.2805 data: 0.1918 max mem: 5716\n","Epoch: [39402] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0231 (0.0239) time: 0.0945 data: 0.0161 max mem: 5716\n","Epoch: [39402] Total time: 0:00:01 (0.0985 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0231 (0.0239) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39403] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0235 (0.0235) time: 0.2811 data: 0.1887 max mem: 5716\n","Epoch: [39403] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0244 (0.0245) time: 0.0956 data: 0.0159 max mem: 5716\n","Epoch: [39403] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0244 (0.0245) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39404] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0218 (0.0218) time: 0.2864 data: 0.1977 max mem: 5716\n","Epoch: [39404] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0211 (0.0210) time: 0.0956 data: 0.0166 max mem: 5716\n","Epoch: [39404] Total time: 0:00:01 (0.0998 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0211 (0.0210) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39405] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0200 (0.0200) time: 0.2925 data: 0.2036 max mem: 5716\n","Epoch: [39405] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0200 (0.0207) time: 0.0977 data: 0.0171 max mem: 5716\n","Epoch: 
[39405] Total time: 0:00:01 (0.1019 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0200 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39406] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0234 (0.0234) time: 0.2931 data: 0.2053 max mem: 5716\n","Epoch: [39406] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0201 (0.0207) time: 0.0968 data: 0.0173 max mem: 5716\n","Epoch: [39406] Total time: 0:00:01 (0.1010 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0201 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39407] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0226 (0.0226) time: 0.2947 data: 0.2062 max mem: 5716\n","Epoch: [39407] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0223 (0.0235) time: 0.0963 data: 0.0173 max mem: 5716\n","Epoch: [39407] Total time: 0:00:01 (0.1005 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0223 (0.0235) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39408] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0209 (0.0209) time: 0.2987 data: 0.1991 max mem: 5716\n","Epoch: [39408] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0225 (0.0232) time: 0.0959 data: 0.0167 max mem: 5716\n","Epoch: [39408] Total time: 0:00:01 (0.1000 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0225 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39409] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0214 (0.0214) time: 0.2844 data: 0.1946 max mem: 5716\n","Epoch: [39409] [11/12] eta: 0:00:00 
lr: 0.000000 loss: 0.0224 (0.0232) time: 0.0942 data: 0.0164 max mem: 5716\n","Epoch: [39409] Total time: 0:00:01 (0.0983 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0224 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39410] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0206 (0.0206) time: 0.2946 data: 0.2026 max mem: 5716\n","Epoch: [39410] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0245 (0.0247) time: 0.0958 data: 0.0170 max mem: 5716\n","Epoch: [39410] Total time: 0:00:01 (0.0999 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0245 (0.0247) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39411] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0248 (0.0248) time: 0.2897 data: 0.1965 max mem: 5716\n","Epoch: [39411] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0239 (0.0242) time: 0.0976 data: 0.0165 max mem: 5716\n","Epoch: [39411] Total time: 0:00:01 (0.1018 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0239 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39412] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0228 (0.0228) time: 0.2908 data: 0.2040 max mem: 5716\n","Epoch: [39412] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0238 (0.0243) time: 0.0953 data: 0.0172 max mem: 5716\n","Epoch: [39412] Total time: 0:00:01 (0.0994 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0238 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39413] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0164 
(0.0164) time: 0.2857 data: 0.1967 max mem: 5716\n","Epoch: [39413] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0216 (0.0207) time: 0.0949 data: 0.0165 max mem: 5716\n","Epoch: [39413] Total time: 0:00:01 (0.0989 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0216 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39414] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0189 (0.0189) time: 0.2989 data: 0.1944 max mem: 5716\n","Epoch: [39414] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0212 (0.0207) time: 0.0957 data: 0.0164 max mem: 5716\n","Epoch: [39414] Total time: 0:00:01 (0.0998 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0212 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39415] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0183 (0.0183) time: 0.2931 data: 0.1895 max mem: 5716\n","Epoch: [39415] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0206 (0.0208) time: 0.0948 data: 0.0160 max mem: 5716\n","Epoch: [39415] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0206 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39416] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0248 (0.0248) time: 0.2941 data: 0.1972 max mem: 5716\n","Epoch: [39416] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0229 (0.0230) time: 0.0954 data: 0.0166 max mem: 5716\n","Epoch: [39416] Total time: 0:00:01 (0.0994 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0229 (0.0230) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39417] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0253 (0.0253) time: 0.2814 data: 0.1927 max mem: 5716\n","Epoch: [39417] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0240 (0.0233) time: 0.0946 data: 0.0162 max mem: 5716\n","Epoch: [39417] Total time: 0:00:01 (0.0986 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0240 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39418] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0272 (0.0272) time: 0.2854 data: 0.1969 max mem: 5716\n","Epoch: [39418] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0229 (0.0231) time: 0.0951 data: 0.0166 max mem: 5716\n","Epoch: [39418] Total time: 0:00:01 (0.0993 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0229 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39419] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0251 (0.0251) time: 0.2974 data: 0.2104 max mem: 5716\n","Epoch: [39419] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0236 (0.0241) time: 0.0957 data: 0.0177 max mem: 5716\n","Epoch: [39419] Total time: 0:00:01 (0.1021 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0236 (0.0241) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39420] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0283 (0.0283) time: 0.2871 data: 0.1964 max mem: 5716\n","Epoch: [39420] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0248 (0.0247) time: 0.0950 data: 0.0165 max mem: 5716\n","Epoch: [39420] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0248 
(0.0247) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39421] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0231 (0.0231) time: 0.2837 data: 0.1943 max mem: 5716\n","Epoch: [39421] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0236 (0.0239) time: 0.0942 data: 0.0163 max mem: 5716\n","Epoch: [39421] Total time: 0:00:01 (0.0983 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0236 (0.0239) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39422] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0210 (0.0210) time: 0.2987 data: 0.2024 max mem: 5716\n","Epoch: [39422] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0198 (0.0206) time: 0.0958 data: 0.0170 max mem: 5716\n","Epoch: [39422] Total time: 0:00:01 (0.0999 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0198 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39423] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0184 (0.0184) time: 0.2824 data: 0.1930 max mem: 5716\n","Epoch: [39423] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0201 (0.0212) time: 0.0943 data: 0.0162 max mem: 5716\n","Epoch: [39423] Total time: 0:00:01 (0.0985 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0201 (0.0212) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39424] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0200 (0.0200) time: 0.2900 data: 0.1969 max mem: 5716\n","Epoch: [39424] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0211 (0.0210) time: 0.0982 data: 0.0166 max mem: 5716\n","Epoch: 
[39424] Total time: 0:00:01 (0.1024 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0211 (0.0210) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39425] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0176 (0.0176) time: 0.2919 data: 0.2039 max mem: 5716\n","Epoch: [39425] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0229 (0.0233) time: 0.0951 data: 0.0171 max mem: 5716\n","Epoch: [39425] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0229 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39426] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0181 (0.0181) time: 0.2887 data: 0.2021 max mem: 5716\n","Epoch: [39426] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0224 (0.0233) time: 0.0955 data: 0.0170 max mem: 5716\n","Epoch: [39426] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0224 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39427] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0221 (0.0221) time: 0.2831 data: 0.1929 max mem: 5716\n","Epoch: [39427] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0226 (0.0232) time: 0.0942 data: 0.0162 max mem: 5716\n","Epoch: [39427] Total time: 0:00:01 (0.0983 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0226 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39428] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0248 (0.0248) time: 0.2871 data: 0.1975 max mem: 5716\n","Epoch: [39428] [11/12] eta: 0:00:00 
lr: 0.000000 loss: 0.0235 (0.0243) time: 0.0947 data: 0.0166 max mem: 5716\n","Epoch: [39428] Total time: 0:00:01 (0.0989 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0235 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39429] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0199 (0.0199) time: 0.2887 data: 0.1985 max mem: 5716\n","Epoch: [39429] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0234 (0.0241) time: 0.0948 data: 0.0167 max mem: 5716\n","Epoch: [39429] Total time: 0:00:01 (0.0989 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0234 (0.0241) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39430] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0223 (0.0223) time: 0.2868 data: 0.1969 max mem: 5716\n","Epoch: [39430] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0245 (0.0243) time: 0.0946 data: 0.0166 max mem: 5716\n","Epoch: [39430] Total time: 0:00:01 (0.0998 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0245 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39431] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0185 (0.0185) time: 0.3016 data: 0.2137 max mem: 5716\n","Epoch: [39431] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0201 (0.0210) time: 0.0968 data: 0.0180 max mem: 5716\n","Epoch: [39431] Total time: 0:00:01 (0.1011 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0201 (0.0210) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39432] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0239 
(0.0239) time: 0.3014 data: 0.2115 max mem: 5716\n","Epoch: [39432] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0205 (0.0207) time: 0.0967 data: 0.0178 max mem: 5716\n","Epoch: [39432] Total time: 0:00:01 (0.1008 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0205 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39433] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0250 (0.0250) time: 0.2968 data: 0.2085 max mem: 5716\n","Epoch: [39433] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0202 (0.0207) time: 0.0970 data: 0.0175 max mem: 5716\n","Epoch: [39433] Total time: 0:00:01 (0.1012 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0202 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39434] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0219 (0.0219) time: 0.2904 data: 0.2022 max mem: 5716\n","Epoch: [39434] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0241 (0.0237) time: 0.0948 data: 0.0170 max mem: 5716\n","Epoch: [39434] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0241 (0.0237) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39435] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0207 (0.0207) time: 0.2852 data: 0.1940 max mem: 5716\n","Epoch: [39435] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0215 (0.0231) time: 0.0944 data: 0.0163 max mem: 5716\n","Epoch: [39435] Total time: 0:00:01 (0.0985 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0215 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39436] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0205 (0.0205) time: 0.2868 data: 0.1962 max mem: 5716\n","Epoch: [39436] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0229 (0.0230) time: 0.0947 data: 0.0165 max mem: 5716\n","Epoch: [39436] Total time: 0:00:01 (0.0988 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0229 (0.0230) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39437] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0240 (0.0240) time: 0.2872 data: 0.1969 max mem: 5716\n","Epoch: [39437] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0240 (0.0243) time: 0.0947 data: 0.0166 max mem: 5716\n","Epoch: [39437] Total time: 0:00:01 (0.0989 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0240 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39438] [ 0/12] eta: 0:00:05 lr: 0.000000 loss: 0.0249 (0.0249) time: 0.4923 data: 0.4070 max mem: 5716\n","Epoch: [39438] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0235 (0.0241) time: 0.1126 data: 0.0341 max mem: 5716\n","Epoch: [39438] Total time: 0:00:01 (0.1168 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0235 (0.0241) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39439] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0291 (0.0291) time: 0.2875 data: 0.1990 max mem: 5716\n","Epoch: [39439] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0240 (0.0246) time: 0.0957 data: 0.0168 max mem: 5716\n","Epoch: [39439] Total time: 0:00:01 (0.0999 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0240 
(0.0246) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39440] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0197 (0.0197) time: 0.2973 data: 0.2062 max mem: 5716\n","Epoch: [39440] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0207 (0.0208) time: 0.0974 data: 0.0174 max mem: 5716\n","Epoch: [39440] Total time: 0:00:01 (0.1027 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0207 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39441] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0204 (0.0204) time: 0.2853 data: 0.1960 max mem: 5716\n","Epoch: [39441] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0202 (0.0209) time: 0.0957 data: 0.0165 max mem: 5716\n","Epoch: [39441] Total time: 0:00:01 (0.0998 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0202 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39442] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0168 (0.0168) time: 0.2954 data: 0.2081 max mem: 5716\n","Epoch: [39442] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0198 (0.0207) time: 0.0959 data: 0.0175 max mem: 5716\n","Epoch: [39442] Total time: 0:00:01 (0.1000 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0198 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39443] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0283 (0.0283) time: 0.2917 data: 0.1999 max mem: 5716\n","Epoch: [39443] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0220 (0.0228) time: 0.0983 data: 0.0168 max mem: 5716\n","Epoch: 
[39443] Total time: 0:00:01 (0.1025 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0220 (0.0228) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39444] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0273 (0.0273) time: 0.2917 data: 0.2049 max mem: 5716\n","Epoch: [39444] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0233 (0.0235) time: 0.0961 data: 0.0172 max mem: 5716\n","Epoch: [39444] Total time: 0:00:01 (0.1003 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0233 (0.0235) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39445] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0291 (0.0291) time: 0.2934 data: 0.2067 max mem: 5716\n","Epoch: [39445] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0214 (0.0229) time: 0.0959 data: 0.0174 max mem: 5716\n","Epoch: [39445] Total time: 0:00:01 (0.1000 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0214 (0.0229) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39446] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0234 (0.0234) time: 0.2854 data: 0.1955 max mem: 5716\n","Epoch: [39446] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0239 (0.0248) time: 0.0944 data: 0.0164 max mem: 5716\n","Epoch: [39446] Total time: 0:00:01 (0.0985 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0239 (0.0248) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39447] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0231 (0.0231) time: 0.2967 data: 0.2098 max mem: 5716\n","Epoch: [39447] [11/12] eta: 0:00:00 
lr: 0.000000 loss: 0.0245 (0.0245) time: 0.0957 data: 0.0176 max mem: 5716\n","Epoch: [39447] Total time: 0:00:01 (0.0999 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0245 (0.0245) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39448] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0208 (0.0208) time: 0.2931 data: 0.1969 max mem: 5716\n","Epoch: [39448] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0236 (0.0242) time: 0.0990 data: 0.0166 max mem: 5716\n","Epoch: [39448] Total time: 0:00:01 (0.1032 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0236 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39449] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0192 (0.0192) time: 0.2835 data: 0.1942 max mem: 5716\n","Epoch: [39449] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0201 (0.0207) time: 0.0955 data: 0.0164 max mem: 5716\n","Epoch: [39449] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0201 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39450] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0183 (0.0183) time: 0.2881 data: 0.1972 max mem: 5716\n","Epoch: [39450] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0204 (0.0206) time: 0.0956 data: 0.0166 max mem: 5716\n","Epoch: [39450] Total time: 0:00:01 (0.0998 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0204 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39451] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0168 
(0.0168) time: 0.2900 data: 0.2035 max mem: 5716\n","Epoch: [39451] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0196 (0.0208) time: 0.0951 data: 0.0171 max mem: 5716\n","Epoch: [39451] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0196 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39452] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0218 (0.0218) time: 0.2929 data: 0.2058 max mem: 5716\n","Epoch: [39452] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0220 (0.0229) time: 0.0949 data: 0.0173 max mem: 5716\n","Epoch: [39452] Total time: 0:00:01 (0.0989 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0220 (0.0229) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39453] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0239 (0.0239) time: 0.2895 data: 0.2025 max mem: 5716\n","Epoch: [39453] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0239 (0.0233) time: 0.0946 data: 0.0170 max mem: 5716\n","Epoch: [39453] Total time: 0:00:01 (0.0986 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0239 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39454] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0240 (0.0240) time: 0.2824 data: 0.1924 max mem: 5716\n","Epoch: [39454] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0229 (0.0234) time: 0.0942 data: 0.0162 max mem: 5716\n","Epoch: [39454] Total time: 0:00:01 (0.0982 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0229 (0.0234) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39455] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0228 (0.0228) time: 0.2938 data: 0.2055 max mem: 5716\n","Epoch: [39455] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0235 (0.0242) time: 0.0961 data: 0.0173 max mem: 5716\n","Epoch: [39455] Total time: 0:00:01 (0.1002 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0235 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39456] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0251 (0.0251) time: 0.3070 data: 0.2092 max mem: 5716\n","Epoch: [39456] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0250 (0.0246) time: 0.0979 data: 0.0176 max mem: 5716\n","Epoch: [39456] Total time: 0:00:01 (0.1021 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0250 (0.0246) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39457] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0268 (0.0268) time: 0.2921 data: 0.2065 max mem: 5716\n","Epoch: [39457] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0239 (0.0247) time: 0.0950 data: 0.0174 max mem: 5716\n","Epoch: [39457] Total time: 0:00:01 (0.0991 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0239 (0.0247) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39458] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0261 (0.0261) time: 0.2884 data: 0.1899 max mem: 5716\n","Epoch: [39458] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0199 (0.0209) time: 0.0973 data: 0.0160 max mem: 5716\n","Epoch: [39458] Total time: 0:00:01 (0.1015 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0199 
(0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39459] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0212 (0.0212) time: 0.2852 data: 0.1941 max mem: 5716\n","Epoch: [39459] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0211 (0.0206) time: 0.0941 data: 0.0163 max mem: 5716\n","Epoch: [39459] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0211 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39460] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0201 (0.0201) time: 0.2906 data: 0.1912 max mem: 5716\n","Epoch: [39460] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0210 (0.0209) time: 0.0946 data: 0.0161 max mem: 5716\n","Epoch: [39460] Total time: 0:00:01 (0.0988 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0210 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39461] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0191 (0.0191) time: 0.2846 data: 0.1954 max mem: 5716\n","Epoch: [39461] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0226 (0.0229) time: 0.0942 data: 0.0164 max mem: 5716\n","Epoch: [39461] Total time: 0:00:01 (0.0997 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0226 (0.0229) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39462] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0258 (0.0258) time: 0.2953 data: 0.2081 max mem: 5716\n","Epoch: [39462] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0224 (0.0232) time: 0.0956 data: 0.0175 max mem: 5716\n","Epoch: 
[39462] Total time: 0:00:01 (0.0998 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0224 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39463] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0285 (0.0285) time: 0.2844 data: 0.1962 max mem: 5716\n","Epoch: [39463] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0232 (0.0233) time: 0.0941 data: 0.0165 max mem: 5716\n","Epoch: [39463] Total time: 0:00:01 (0.0982 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0232 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39464] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0254 (0.0254) time: 0.2924 data: 0.2066 max mem: 5716\n","Epoch: [39464] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0247 (0.0246) time: 0.0950 data: 0.0174 max mem: 5716\n","Epoch: [39464] Total time: 0:00:01 (0.0994 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0247 (0.0246) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39465] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0270 (0.0270) time: 0.2877 data: 0.1951 max mem: 5716\n","Epoch: [39465] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0242 (0.0242) time: 0.0949 data: 0.0164 max mem: 5716\n","Epoch: [39465] Total time: 0:00:01 (0.0991 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0242 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39466] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0230 (0.0230) time: 0.2955 data: 0.2074 max mem: 5716\n","Epoch: [39466] [11/12] eta: 0:00:00 
lr: 0.000000 loss: 0.0236 (0.0243) time: 0.0959 data: 0.0174 max mem: 5716\n","Epoch: [39466] Total time: 0:00:01 (0.1011 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0236 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39467] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0181 (0.0181) time: 0.3002 data: 0.1989 max mem: 5716\n","Epoch: [39467] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0215 (0.0205) time: 0.0968 data: 0.0167 max mem: 5716\n","Epoch: [39467] Total time: 0:00:01 (0.1010 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0215 (0.0205) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39468] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0185 (0.0185) time: 0.2852 data: 0.1965 max mem: 5716\n","Epoch: [39468] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0194 (0.0208) time: 0.0958 data: 0.0166 max mem: 5716\n","Epoch: [39468] Total time: 0:00:01 (0.1000 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0194 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39469] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0195 (0.0195) time: 0.2871 data: 0.1955 max mem: 5716\n","Epoch: [39469] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0208 (0.0207) time: 0.0952 data: 0.0164 max mem: 5716\n","Epoch: [39469] Total time: 0:00:01 (0.0993 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0208 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39470] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0215 
(0.0215) time: 0.2899 data: 0.1899 max mem: 5716\n","Epoch: [39470] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0229 (0.0237) time: 0.0950 data: 0.0160 max mem: 5716\n","Epoch: [39470] Total time: 0:00:01 (0.0991 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0229 (0.0237) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39471] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0215 (0.0215) time: 0.2855 data: 0.1958 max mem: 5716\n","Epoch: [39471] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0226 (0.0230) time: 0.0942 data: 0.0165 max mem: 5716\n","Epoch: [39471] Total time: 0:00:01 (0.0983 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0226 (0.0230) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39472] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0271 (0.0271) time: 0.2892 data: 0.1986 max mem: 5716\n","Epoch: [39472] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0227 (0.0235) time: 0.0958 data: 0.0167 max mem: 5716\n","Epoch: [39472] Total time: 0:00:01 (0.1000 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0227 (0.0235) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39473] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0289 (0.0289) time: 0.2865 data: 0.1976 max mem: 5716\n","Epoch: [39473] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0229 (0.0241) time: 0.0949 data: 0.0166 max mem: 5716\n","Epoch: [39473] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0229 (0.0241) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39474] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0270 (0.0270) time: 0.2995 data: 0.1945 max mem: 5716\n","Epoch: [39474] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0235 (0.0245) time: 0.0972 data: 0.0164 max mem: 5716\n","Epoch: [39474] Total time: 0:00:01 (0.1013 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0235 (0.0245) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39475] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0287 (0.0287) time: 0.2949 data: 0.1967 max mem: 5716\n","Epoch: [39475] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0237 (0.0243) time: 0.0952 data: 0.0166 max mem: 5716\n","Epoch: [39475] Total time: 0:00:01 (0.0993 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0237 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39476] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0222 (0.0222) time: 0.2892 data: 0.2014 max mem: 5716\n","Epoch: [39476] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0205 (0.0209) time: 0.0955 data: 0.0169 max mem: 5716\n","Epoch: [39476] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0205 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39477] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0208 (0.0208) time: 0.2913 data: 0.2007 max mem: 5716\n","Epoch: [39477] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0206 (0.0206) time: 0.0971 data: 0.0169 max mem: 5716\n","Epoch: [39477] Total time: 0:00:01 (0.1013 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0206 
(0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39478] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0260 (0.0260) time: 0.2858 data: 0.1968 max mem: 5716\n","Epoch: [39478] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0196 (0.0207) time: 0.0944 data: 0.0166 max mem: 5716\n","Epoch: [39478] Total time: 0:00:01 (0.0985 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0196 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39479] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0245 (0.0245) time: 0.2940 data: 0.2060 max mem: 5716\n","Epoch: [39479] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0239 (0.0233) time: 0.0953 data: 0.0173 max mem: 5716\n","Epoch: [39479] Total time: 0:00:01 (0.0994 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0239 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39480] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0208 (0.0208) time: 0.2801 data: 0.1919 max mem: 5716\n","Epoch: [39480] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0225 (0.0233) time: 0.0941 data: 0.0161 max mem: 5716\n","Epoch: [39480] Total time: 0:00:01 (0.0982 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0225 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39481] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0184 (0.0184) time: 0.2825 data: 0.1912 max mem: 5716\n","Epoch: [39481] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0220 (0.0232) time: 0.0956 data: 0.0161 max mem: 5716\n","Epoch: 
[39481] Total time: 0:00:01 (0.0998 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0220 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39482] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0227 (0.0227) time: 0.2909 data: 0.2032 max mem: 5716\n","Epoch: [39482] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0237 (0.0248) time: 0.0957 data: 0.0171 max mem: 5716\n","Epoch: [39482] Total time: 0:00:01 (0.0998 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0237 (0.0248) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39483] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0266 (0.0266) time: 0.2881 data: 0.1975 max mem: 5716\n","Epoch: [39483] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0250 (0.0245) time: 0.0959 data: 0.0166 max mem: 5716\n","Epoch: [39483] Total time: 0:00:01 (0.1001 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0250 (0.0245) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39484] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0264 (0.0264) time: 0.2990 data: 0.1967 max mem: 5716\n","Epoch: [39484] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0236 (0.0243) time: 0.0984 data: 0.0166 max mem: 5716\n","Epoch: [39484] Total time: 0:00:01 (0.1025 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0236 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39485] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0172 (0.0172) time: 0.2979 data: 0.1983 max mem: 5716\n","Epoch: [39485] [11/12] eta: 
0:00:00 lr: 0.000000 loss: 0.0213 (0.0207) time: 0.0960 data: 0.0167 max mem: 5716\n","Epoch: [39485] Total time: 0:00:01 (0.1021 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0213 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39486] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0229 (0.0229) time: 0.2904 data: 0.1984 max mem: 5716\n","Epoch: [39486] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0204 (0.0211) time: 0.0953 data: 0.0167 max mem: 5716\n","Epoch: [39486] Total time: 0:00:01 (0.0994 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0204 (0.0211) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39487] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0242 (0.0242) time: 0.2855 data: 0.1962 max mem: 5716\n","Epoch: [39487] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0210 (0.0208) time: 0.0958 data: 0.0165 max mem: 5716\n","Epoch: [39487] Total time: 0:00:01 (0.0999 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0210 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39488] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0251 (0.0251) time: 0.3007 data: 0.2132 max mem: 5716\n","Epoch: [39488] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0234 (0.0231) time: 0.0966 data: 0.0179 max mem: 5716\n","Epoch: [39488] Total time: 0:00:01 (0.1008 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0234 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39489] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 
0.0193 (0.0193) time: 0.2895 data: 0.2008 max mem: 5716\n","Epoch: [39489] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0235 (0.0232) time: 0.0953 data: 0.0169 max mem: 5716\n","Epoch: [39489] Total time: 0:00:01 (0.0994 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0235 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39490] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0222 (0.0222) time: 0.2886 data: 0.1991 max mem: 5716\n","Epoch: [39490] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0229 (0.0229) time: 0.0947 data: 0.0167 max mem: 5716\n","Epoch: [39490] Total time: 0:00:01 (0.0988 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0229 (0.0229) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39491] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0213 (0.0213) time: 0.2959 data: 0.2051 max mem: 5716\n","Epoch: [39491] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0245 (0.0242) time: 0.0955 data: 0.0172 max mem: 5716\n","Epoch: [39491] Total time: 0:00:01 (0.0997 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0245 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39492] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0249 (0.0249) time: 0.2931 data: 0.2053 max mem: 5716\n","Epoch: [39492] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0251 (0.0250) time: 0.0954 data: 0.0173 max mem: 5716\n","Epoch: [39492] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0251 (0.0250) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39493] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0216 (0.0216) time: 0.3029 data: 0.2124 max mem: 5716\n","Epoch: [39493] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0231 (0.0245) time: 0.0969 data: 0.0179 max mem: 5716\n","Epoch: [39493] Total time: 0:00:01 (0.1026 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0231 (0.0245) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39494] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0234 (0.0234) time: 0.2864 data: 0.1963 max mem: 5716\n","Epoch: [39494] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0198 (0.0209) time: 0.0952 data: 0.0165 max mem: 5716\n","Epoch: [39494] Total time: 0:00:01 (0.0993 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0198 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39495] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0181 (0.0181) time: 0.2902 data: 0.1976 max mem: 5716\n","Epoch: [39495] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0185 (0.0208) time: 0.0951 data: 0.0166 max mem: 5716\n","Epoch: [39495] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0185 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39496] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0217 (0.0217) time: 0.2823 data: 0.1935 max mem: 5716\n","Epoch: [39496] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0204 (0.0211) time: 0.0940 data: 0.0163 max mem: 5716\n","Epoch: [39496] Total time: 0:00:01 (0.0981 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0204 
(0.0211) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39497] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0202 (0.0202) time: 0.2934 data: 0.2045 max mem: 5716\n","Epoch: [39497] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0223 (0.0232) time: 0.0955 data: 0.0172 max mem: 5716\n","Epoch: [39497] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0223 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39498] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0192 (0.0192) time: 0.2884 data: 0.2019 max mem: 5716\n","Epoch: [39498] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0227 (0.0228) time: 0.0951 data: 0.0170 max mem: 5716\n","Epoch: [39498] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0227 (0.0228) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39499] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0213 (0.0213) time: 0.2846 data: 0.1943 max mem: 5716\n","Epoch: [39499] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0228 (0.0231) time: 0.0945 data: 0.0164 max mem: 5716\n","Epoch: [39499] Total time: 0:00:01 (0.0987 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0228 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39500] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0283 (0.0283) time: 0.2972 data: 0.2021 max mem: 5716\n","Epoch: [39500] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0236 (0.0247) time: 0.0955 data: 0.0170 max mem: 5716\n","Epoch: 
[39500] Total time: 0:00:01 (0.1014 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0236 (0.0247) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39501] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0229 (0.0229) time: 0.2900 data: 0.1951 max mem: 5716\n","Epoch: [39501] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0229 (0.0240) time: 0.0977 data: 0.0164 max mem: 5716\n","Epoch: [39501] Total time: 0:00:01 (0.1018 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0229 (0.0240) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39502] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0267 (0.0267) time: 0.2877 data: 0.1977 max mem: 5716\n","Epoch: [39502] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0252 (0.0247) time: 0.0946 data: 0.0166 max mem: 5716\n","Epoch: [39502] Total time: 0:00:01 (0.0987 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0252 (0.0247) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39503] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0176 (0.0176) time: 0.2855 data: 0.1939 max mem: 5716\n","Epoch: [39503] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0203 (0.0206) time: 0.0969 data: 0.0163 max mem: 5716\n","Epoch: [39503] Total time: 0:00:01 (0.1010 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0203 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39504] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0184 (0.0184) time: 0.2878 data: 0.1980 max mem: 5716\n","Epoch: [39504] [11/12] eta: 
0:00:00 lr: 0.000000 loss: 0.0222 (0.0209) time: 0.0951 data: 0.0167 max mem: 5716\n","Epoch: [39504] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0222 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39505] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0189 (0.0189) time: 0.2999 data: 0.2067 max mem: 5716\n","Epoch: [39505] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0204 (0.0207) time: 0.0956 data: 0.0174 max mem: 5716\n","Epoch: [39505] Total time: 0:00:01 (0.0998 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0204 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39506] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0202 (0.0202) time: 0.3105 data: 0.2215 max mem: 5716\n","Epoch: [39506] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0219 (0.0233) time: 0.0988 data: 0.0186 max mem: 5716\n","Epoch: [39506] Total time: 0:00:01 (0.1029 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0219 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39507] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0279 (0.0279) time: 0.2906 data: 0.2001 max mem: 5716\n","Epoch: [39507] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0232 (0.0233) time: 0.0960 data: 0.0169 max mem: 5716\n","Epoch: [39507] Total time: 0:00:01 (0.1001 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0232 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39508] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 
0.0196 (0.0196) time: 0.2853 data: 0.1953 max mem: 5716\n","Epoch: [39508] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0209 (0.0233) time: 0.0952 data: 0.0164 max mem: 5716\n","Epoch: [39508] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0209 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39509] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0292 (0.0292) time: 0.2848 data: 0.1961 max mem: 5716\n","Epoch: [39509] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0247 (0.0246) time: 0.0967 data: 0.0165 max mem: 5716\n","Epoch: [39509] Total time: 0:00:01 (0.1022 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0247 (0.0246) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39510] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0211 (0.0211) time: 0.2821 data: 0.1922 max mem: 5716\n","Epoch: [39510] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0241 (0.0238) time: 0.0945 data: 0.0162 max mem: 5716\n","Epoch: [39510] Total time: 0:00:01 (0.0986 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0241 (0.0238) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39511] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0232 (0.0232) time: 0.2877 data: 0.1998 max mem: 5716\n","Epoch: [39511] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0232 (0.0244) time: 0.0949 data: 0.0168 max mem: 5716\n","Epoch: [39511] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0232 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39512] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0189 (0.0189) time: 0.2918 data: 0.2032 max mem: 5716\n","Epoch: [39512] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0200 (0.0207) time: 0.0949 data: 0.0171 max mem: 5716\n","Epoch: [39512] Total time: 0:00:01 (0.0991 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0200 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39513] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0180 (0.0180) time: 0.2804 data: 0.1904 max mem: 5716\n","Epoch: [39513] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0200 (0.0206) time: 0.0943 data: 0.0160 max mem: 5716\n","Epoch: [39513] Total time: 0:00:01 (0.0984 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0200 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39514] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0185 (0.0185) time: 0.2900 data: 0.2001 max mem: 5716\n","Epoch: [39514] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0209 (0.0209) time: 0.0958 data: 0.0168 max mem: 5716\n","Epoch: [39514] Total time: 0:00:01 (0.0999 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0209 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39515] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0268 (0.0268) time: 0.2919 data: 0.2061 max mem: 5716\n","Epoch: [39515] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0223 (0.0230) time: 0.0961 data: 0.0174 max mem: 5716\n","Epoch: [39515] Total time: 0:00:01 (0.1002 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0223 
(0.0230) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39516] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0255 (0.0255) time: 0.2966 data: 0.2086 max mem: 5716\n","Epoch: [39516] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0227 (0.0235) time: 0.0975 data: 0.0175 max mem: 5716\n","Epoch: [39516] Total time: 0:00:01 (0.1017 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0227 (0.0235) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39517] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0209 (0.0209) time: 0.2840 data: 0.1944 max mem: 5716\n","Epoch: [39517] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0237 (0.0232) time: 0.0944 data: 0.0164 max mem: 5716\n","Epoch: [39517] Total time: 0:00:01 (0.0985 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0237 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39518] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0230 (0.0230) time: 0.3005 data: 0.2123 max mem: 5716\n","Epoch: [39518] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0247 (0.0243) time: 0.0973 data: 0.0179 max mem: 5716\n","Epoch: [39518] Total time: 0:00:01 (0.1014 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0247 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39519] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0273 (0.0273) time: 0.2910 data: 0.2017 max mem: 5716\n","Epoch: [39519] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0243 (0.0242) time: 0.0965 data: 0.0170 max mem: 5716\n","Epoch: 
[39519] Total time: 0:00:01 (0.1006 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0243 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39520] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0239 (0.0239) time: 0.3041 data: 0.1996 max mem: 5716\n","Epoch: [39520] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0238 (0.0242) time: 0.0958 data: 0.0168 max mem: 5716\n","Epoch: [39520] Total time: 0:00:01 (0.1000 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0238 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39521] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0187 (0.0187) time: 0.2844 data: 0.1939 max mem: 5716\n","Epoch: [39521] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0189 (0.0207) time: 0.0953 data: 0.0163 max mem: 5716\n","Epoch: [39521] Total time: 0:00:01 (0.0994 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0189 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39522] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0223 (0.0223) time: 0.2962 data: 0.1997 max mem: 5716\n","Epoch: [39522] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0216 (0.0208) time: 0.0958 data: 0.0168 max mem: 5716\n","Epoch: [39522] Total time: 0:00:01 (0.0999 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0216 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39523] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0216 (0.0216) time: 0.2877 data: 0.1983 max mem: 5716\n","Epoch: [39523] [11/12] eta: 0:00:00 
lr: 0.000000 loss: 0.0210 (0.0211) time: 0.0953 data: 0.0167 max mem: 5716\n","Epoch: [39523] Total time: 0:00:01 (0.0994 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0210 (0.0211) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39524] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0262 (0.0262) time: 0.2947 data: 0.2067 max mem: 5716\n","Epoch: [39524] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0227 (0.0233) time: 0.0964 data: 0.0174 max mem: 5716\n","Epoch: [39524] Total time: 0:00:01 (0.1006 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0227 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39525] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0234 (0.0234) time: 0.2840 data: 0.1945 max mem: 5716\n","Epoch: [39525] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0234 (0.0234) time: 0.0950 data: 0.0164 max mem: 5716\n","Epoch: [39525] Total time: 0:00:01 (0.1002 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0234 (0.0234) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39526] [ 0/12] eta: 0:00:06 lr: 0.000000 loss: 0.0185 (0.0185) time: 0.5113 data: 0.4237 max mem: 5716\n","Epoch: [39526] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0224 (0.0231) time: 0.1134 data: 0.0355 max mem: 5716\n","Epoch: [39526] Total time: 0:00:01 (0.1176 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0224 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39527] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0232 
(0.0232) time: 0.2982 data: 0.1986 max mem: 5716\n","Epoch: [39527] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0240 (0.0242) time: 0.0958 data: 0.0167 max mem: 5716\n","Epoch: [39527] Total time: 0:00:01 (0.1000 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0240 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39528] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0230 (0.0230) time: 0.2881 data: 0.2002 max mem: 5716\n","Epoch: [39528] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0233 (0.0242) time: 0.0944 data: 0.0168 max mem: 5716\n","Epoch: [39528] Total time: 0:00:01 (0.0985 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0233 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39529] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0261 (0.0261) time: 0.2851 data: 0.1973 max mem: 5716\n","Epoch: [39529] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0234 (0.0241) time: 0.0942 data: 0.0166 max mem: 5716\n","Epoch: [39529] Total time: 0:00:01 (0.0983 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0234 (0.0241) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39530] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0239 (0.0239) time: 0.2973 data: 0.2092 max mem: 5716\n","Epoch: [39530] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0203 (0.0210) time: 0.0962 data: 0.0176 max mem: 5716\n","Epoch: [39530] Total time: 0:00:01 (0.1004 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0203 (0.0210) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39531] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0200 (0.0200) time: 0.2914 data: 0.1999 max mem: 5716\n","Epoch: [39531] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0200 (0.0208) time: 0.0955 data: 0.0168 max mem: 5716\n","Epoch: [39531] Total time: 0:00:01 (0.0997 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0200 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39532] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0181 (0.0181) time: 0.2900 data: 0.2032 max mem: 5716\n","Epoch: [39532] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0199 (0.0205) time: 0.0953 data: 0.0171 max mem: 5716\n","Epoch: [39532] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0199 (0.0205) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39533] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0180 (0.0180) time: 0.2881 data: 0.2016 max mem: 5716\n","Epoch: [39533] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0231 (0.0232) time: 0.0949 data: 0.0169 max mem: 5716\n","Epoch: [39533] Total time: 0:00:01 (0.0989 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0231 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39534] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0264 (0.0264) time: 0.2838 data: 0.1942 max mem: 5716\n","Epoch: [39534] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0226 (0.0235) time: 0.0960 data: 0.0163 max mem: 5716\n","Epoch: [39534] Total time: 0:00:01 (0.1013 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0226 
(0.0235) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39535] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0237 (0.0237) time: 0.2918 data: 0.1983 max mem: 5716\n","Epoch: [39535] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0237 (0.0235) time: 0.0983 data: 0.0167 max mem: 5716\n","Epoch: [39535] Total time: 0:00:01 (0.1024 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0237 (0.0235) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39536] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0223 (0.0223) time: 0.2934 data: 0.2059 max mem: 5716\n","Epoch: [39536] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0228 (0.0241) time: 0.0951 data: 0.0173 max mem: 5716\n","Epoch: [39536] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0228 (0.0241) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39537] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0209 (0.0209) time: 0.2874 data: 0.1970 max mem: 5716\n","Epoch: [39537] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0235 (0.0242) time: 0.0947 data: 0.0166 max mem: 5716\n","Epoch: [39537] Total time: 0:00:01 (0.0988 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0235 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39538] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0287 (0.0287) time: 0.2865 data: 0.2014 max mem: 5716\n","Epoch: [39538] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0245 (0.0243) time: 0.0942 data: 0.0170 max mem: 5716\n","Epoch: 
[39538] Total time: 0:00:01 (0.0983 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0245 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39539] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0243 (0.0243) time: 0.2919 data: 0.1941 max mem: 5716\n","Epoch: [39539] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0205 (0.0205) time: 0.0946 data: 0.0163 max mem: 5716\n","Epoch: [39539] Total time: 0:00:01 (0.0987 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0205 (0.0205) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39540] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0247 (0.0247) time: 0.2956 data: 0.1979 max mem: 5716\n","Epoch: [39540] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0206 (0.0205) time: 0.0957 data: 0.0166 max mem: 5716\n","Epoch: [39540] Total time: 0:00:01 (0.0999 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0206 (0.0205) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39541] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0253 (0.0253) time: 0.2842 data: 0.1939 max mem: 5716\n","Epoch: [39541] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0207 (0.0208) time: 0.0950 data: 0.0163 max mem: 5716\n","Epoch: [39541] Total time: 0:00:01 (0.0991 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0207 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39542] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0225 (0.0225) time: 0.2857 data: 0.1939 max mem: 5716\n","Epoch: [39542] [11/12] eta: 0:00:00 
lr: 0.000000 loss: 0.0234 (0.0234) time: 0.0946 data: 0.0163 max mem: 5716\n","Epoch: [39542] Total time: 0:00:01 (0.0987 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0234 (0.0234) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39543] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0234 (0.0234) time: 0.2931 data: 0.1936 max mem: 5716\n","Epoch: [39543] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0230 (0.0231) time: 0.0976 data: 0.0163 max mem: 5716\n","Epoch: [39543] Total time: 0:00:01 (0.1032 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0230 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39544] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0217 (0.0217) time: 0.2947 data: 0.2046 max mem: 5716\n","Epoch: [39544] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0247 (0.0236) time: 0.0971 data: 0.0172 max mem: 5716\n","Epoch: [39544] Total time: 0:00:01 (0.1012 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0247 (0.0236) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39545] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0246 (0.0246) time: 0.2965 data: 0.2048 max mem: 5716\n","Epoch: [39545] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0239 (0.0245) time: 0.0956 data: 0.0172 max mem: 5716\n","Epoch: [39545] Total time: 0:00:01 (0.0997 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0239 (0.0245) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39546] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0269 
(0.0269) time: 0.2847 data: 0.1973 max mem: 5716\n","Epoch: [39546] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0225 (0.0244) time: 0.0944 data: 0.0166 max mem: 5716\n","Epoch: [39546] Total time: 0:00:01 (0.0984 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0225 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39547] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0257 (0.0257) time: 0.2887 data: 0.1989 max mem: 5716\n","Epoch: [39547] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0236 (0.0244) time: 0.0947 data: 0.0167 max mem: 5716\n","Epoch: [39547] Total time: 0:00:01 (0.0988 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0236 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39548] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0177 (0.0177) time: 0.2890 data: 0.1985 max mem: 5716\n","Epoch: [39548] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0211 (0.0209) time: 0.0955 data: 0.0167 max mem: 5716\n","Epoch: [39548] Total time: 0:00:01 (0.0997 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0211 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39549] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0216 (0.0216) time: 0.2919 data: 0.2045 max mem: 5716\n","Epoch: [39549] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0208 (0.0211) time: 0.0955 data: 0.0172 max mem: 5716\n","Epoch: [39549] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0208 (0.0211) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39550] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0237 (0.0237) time: 0.2953 data: 0.2077 max mem: 5716\n","Epoch: [39550] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0198 (0.0207) time: 0.0961 data: 0.0175 max mem: 5716\n","Epoch: [39550] Total time: 0:00:01 (0.1002 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0198 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39551] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0239 (0.0239) time: 0.2908 data: 0.2007 max mem: 5716\n","Epoch: [39551] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0232 (0.0238) time: 0.0956 data: 0.0169 max mem: 5716\n","Epoch: [39551] Total time: 0:00:01 (0.0998 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0232 (0.0238) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39552] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0270 (0.0270) time: 0.2869 data: 0.1977 max mem: 5716\n","Epoch: [39552] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0238 (0.0230) time: 0.0947 data: 0.0166 max mem: 5716\n","Epoch: [39552] Total time: 0:00:01 (0.0989 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0238 (0.0230) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39553] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0227 (0.0227) time: 0.2883 data: 0.1990 max mem: 5716\n","Epoch: [39553] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0238 (0.0235) time: 0.0961 data: 0.0167 max mem: 5716\n","Epoch: [39553] Total time: 0:00:01 (0.1001 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0238 
(0.0235) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39554] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0248 (0.0248) time: 0.2896 data: 0.2017 max mem: 5716\n","Epoch: [39554] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0246 (0.0245) time: 0.0948 data: 0.0169 max mem: 5716\n","Epoch: [39554] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0246 (0.0245) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39555] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0245 (0.0245) time: 0.2840 data: 0.1902 max mem: 5716\n","Epoch: [39555] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0245 (0.0244) time: 0.0975 data: 0.0160 max mem: 5716\n","Epoch: [39555] Total time: 0:00:01 (0.1016 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0245 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39556] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0206 (0.0206) time: 0.2838 data: 0.1915 max mem: 5716\n","Epoch: [39556] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0241 (0.0243) time: 0.0953 data: 0.0161 max mem: 5716\n","Epoch: [39556] Total time: 0:00:01 (0.0994 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0241 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39557] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0187 (0.0187) time: 0.2873 data: 0.1947 max mem: 5716\n","Epoch: [39557] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0200 (0.0207) time: 0.0960 data: 0.0164 max mem: 5716\n","Epoch: 
[39557] Total time: 0:00:01 (0.1002 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0200 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39558] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0212 (0.0212) time: 0.2774 data: 0.1874 max mem: 5716\n","Epoch: [39558] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0198 (0.0206) time: 0.0936 data: 0.0158 max mem: 5716\n","Epoch: [39558] Total time: 0:00:01 (0.0977 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0198 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39559] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0205 (0.0205) time: 0.2915 data: 0.2058 max mem: 5716\n","Epoch: [39559] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0205 (0.0207) time: 0.0953 data: 0.0173 max mem: 5716\n","Epoch: [39559] Total time: 0:00:01 (0.0994 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0205 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39560] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0248 (0.0248) time: 0.2922 data: 0.2033 max mem: 5716\n","Epoch: [39560] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0234 (0.0236) time: 0.0961 data: 0.0172 max mem: 5716\n","Epoch: [39560] Total time: 0:00:01 (0.1004 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0234 (0.0236) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39561] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0216 (0.0216) time: 0.2834 data: 0.1949 max mem: 5716\n","Epoch: [39561] [11/12] eta: 0:00:00 
lr: 0.000000 loss: 0.0225 (0.0232) time: 0.0949 data: 0.0164 max mem: 5716\n","Epoch: [39561] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0225 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39562] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0234 (0.0234) time: 0.2949 data: 0.2072 max mem: 5716\n","Epoch: [39562] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0228 (0.0232) time: 0.0976 data: 0.0174 max mem: 5716\n","Epoch: [39562] Total time: 0:00:01 (0.1018 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0228 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39563] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0262 (0.0262) time: 0.2903 data: 0.2029 max mem: 5716\n","Epoch: [39563] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0236 (0.0244) time: 0.0951 data: 0.0171 max mem: 5716\n","Epoch: [39563] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0236 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39564] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0253 (0.0253) time: 0.2883 data: 0.1937 max mem: 5716\n","Epoch: [39564] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0234 (0.0239) time: 0.0973 data: 0.0163 max mem: 5716\n","Epoch: [39564] Total time: 0:00:01 (0.1015 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0234 (0.0239) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39565] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0256 
(0.0256) time: 0.2916 data: 0.1902 max mem: 5716\n","Epoch: [39565] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0239 (0.0241) time: 0.0964 data: 0.0160 max mem: 5716\n","Epoch: [39565] Total time: 0:00:01 (0.1005 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0239 (0.0241) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39566] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0206 (0.0206) time: 0.2875 data: 0.2012 max mem: 5716\n","Epoch: [39566] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0207 (0.0207) time: 0.0952 data: 0.0169 max mem: 5716\n","Epoch: [39566] Total time: 0:00:01 (0.0994 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0207 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39567] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0208 (0.0208) time: 0.2914 data: 0.2013 max mem: 5716\n","Epoch: [39567] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0208 (0.0208) time: 0.0954 data: 0.0169 max mem: 5716\n","Epoch: [39567] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0208 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39568] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0187 (0.0187) time: 0.2876 data: 0.1984 max mem: 5716\n","Epoch: [39568] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0201 (0.0211) time: 0.0955 data: 0.0167 max mem: 5716\n","Epoch: [39568] Total time: 0:00:01 (0.0997 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0201 (0.0211) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39569] [ 0/12] eta: 0:00:06 lr: 0.000000 loss: 0.0230 (0.0230) time: 0.5141 data: 0.4305 max mem: 5716\n","Epoch: [39569] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0222 (0.0232) time: 0.1147 data: 0.0360 max mem: 5716\n","Epoch: [39569] Total time: 0:00:01 (0.1187 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0222 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39570] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0233 (0.0233) time: 0.2943 data: 0.1944 max mem: 5716\n","Epoch: [39570] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0233 (0.0235) time: 0.0959 data: 0.0164 max mem: 5716\n","Epoch: [39570] Total time: 0:00:01 (0.1000 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0233 (0.0235) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39571] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0243 (0.0243) time: 0.2841 data: 0.1961 max mem: 5716\n","Epoch: [39571] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0242 (0.0232) time: 0.0943 data: 0.0165 max mem: 5716\n","Epoch: [39571] Total time: 0:00:01 (0.0983 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0242 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39572] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0276 (0.0276) time: 0.2859 data: 0.1976 max mem: 5716\n","Epoch: [39572] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0238 (0.0243) time: 0.0941 data: 0.0166 max mem: 5716\n","Epoch: [39572] Total time: 0:00:01 (0.0983 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0238 
(0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39573] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0262 (0.0262) time: 0.2899 data: 0.2045 max mem: 5716\n","Epoch: [39573] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0233 (0.0243) time: 0.0947 data: 0.0172 max mem: 5716\n","Epoch: [39573] Total time: 0:00:01 (0.0988 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0233 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39574] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0204 (0.0204) time: 0.2975 data: 0.2106 max mem: 5716\n","Epoch: [39574] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0251 (0.0245) time: 0.0960 data: 0.0177 max mem: 5716\n","Epoch: [39574] Total time: 0:00:01 (0.1002 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0251 (0.0245) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39575] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0185 (0.0185) time: 0.2838 data: 0.1953 max mem: 5716\n","Epoch: [39575] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0199 (0.0212) time: 0.0942 data: 0.0164 max mem: 5716\n","Epoch: [39575] Total time: 0:00:01 (0.0985 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0199 (0.0212) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39576] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0198 (0.0198) time: 0.2831 data: 0.1942 max mem: 5716\n","Epoch: [39576] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0196 (0.0204) time: 0.0945 data: 0.0163 max mem: 5716\n","Epoch: 
[39576] Total time: 0:00:01 (0.0987 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0196 (0.0204) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39577] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0169 (0.0169) time: 0.2943 data: 0.1956 max mem: 5716\n","Epoch: [39577] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0190 (0.0208) time: 0.0952 data: 0.0164 max mem: 5716\n","Epoch: [39577] Total time: 0:00:01 (0.0993 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0190 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39578] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0214 (0.0214) time: 0.3043 data: 0.2075 max mem: 5716\n","Epoch: [39578] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0231 (0.0233) time: 0.0995 data: 0.0174 max mem: 5716\n","Epoch: [39578] Total time: 0:00:01 (0.1036 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0231 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39579] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0219 (0.0219) time: 0.2877 data: 0.1989 max mem: 5716\n","Epoch: [39579] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0223 (0.0231) time: 0.0950 data: 0.0167 max mem: 5716\n","Epoch: [39579] Total time: 0:00:01 (0.0991 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0223 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39580] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0247 (0.0247) time: 0.2950 data: 0.2062 max mem: 5716\n","Epoch: [39580] [11/12] eta: 0:00:00 
lr: 0.000000 loss: 0.0243 (0.0235) time: 0.0962 data: 0.0174 max mem: 5716\n","Epoch: [39580] Total time: 0:00:01 (0.1003 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0243 (0.0235) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39581] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0270 (0.0270) time: 0.2921 data: 0.2024 max mem: 5716\n","Epoch: [39581] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0237 (0.0240) time: 0.0969 data: 0.0170 max mem: 5716\n","Epoch: [39581] Total time: 0:00:01 (0.1010 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0237 (0.0240) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39582] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0271 (0.0271) time: 0.3071 data: 0.2090 max mem: 5716\n","Epoch: [39582] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0234 (0.0243) time: 0.0965 data: 0.0176 max mem: 5716\n","Epoch: [39582] Total time: 0:00:01 (0.1007 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0234 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39583] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0248 (0.0248) time: 0.2890 data: 0.1989 max mem: 5716\n","Epoch: [39583] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0243 (0.0245) time: 0.0951 data: 0.0167 max mem: 5716\n","Epoch: [39583] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0243 (0.0245) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39584] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0196 
(0.0196) time: 0.2842 data: 0.1917 max mem: 5716\n","Epoch: [39584] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0191 (0.0207) time: 0.0977 data: 0.0161 max mem: 5716\n","Epoch: [39584] Total time: 0:00:01 (0.1018 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0191 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39585] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0224 (0.0224) time: 0.2832 data: 0.1941 max mem: 5716\n","Epoch: [39585] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0202 (0.0210) time: 0.0948 data: 0.0163 max mem: 5716\n","Epoch: [39585] Total time: 0:00:01 (0.0989 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0202 (0.0210) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39586] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0197 (0.0197) time: 0.2905 data: 0.2014 max mem: 5716\n","Epoch: [39586] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0198 (0.0207) time: 0.0952 data: 0.0169 max mem: 5716\n","Epoch: [39586] Total time: 0:00:01 (0.0993 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0198 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39587] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0241 (0.0241) time: 0.2911 data: 0.2027 max mem: 5716\n","Epoch: [39587] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0218 (0.0232) time: 0.0956 data: 0.0171 max mem: 5716\n","Epoch: [39587] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0218 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39588] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0241 (0.0241) time: 0.2816 data: 0.1909 max mem: 5716\n","Epoch: [39588] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0231 (0.0232) time: 0.0943 data: 0.0161 max mem: 5716\n","Epoch: [39588] Total time: 0:00:01 (0.0983 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0231 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39589] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0244 (0.0244) time: 0.2797 data: 0.1893 max mem: 5716\n","Epoch: [39589] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0230 (0.0233) time: 0.0941 data: 0.0159 max mem: 5716\n","Epoch: [39589] Total time: 0:00:01 (0.0981 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0230 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39590] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0282 (0.0282) time: 0.2860 data: 0.1960 max mem: 5716\n","Epoch: [39590] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0247 (0.0247) time: 0.0950 data: 0.0165 max mem: 5716\n","Epoch: [39590] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0247 (0.0247) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39591] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0233 (0.0233) time: 0.3078 data: 0.1974 max mem: 5716\n","Epoch: [39591] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0233 (0.0241) time: 0.0962 data: 0.0166 max mem: 5716\n","Epoch: [39591] Total time: 0:00:01 (0.1003 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0233 
(0.0241) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39592] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0244 (0.0244) time: 0.2874 data: 0.1983 max mem: 5716\n","Epoch: [39592] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0242 (0.0240) time: 0.0953 data: 0.0167 max mem: 5716\n","Epoch: [39592] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0242 (0.0240) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39593] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0245 (0.0245) time: 0.2864 data: 0.1978 max mem: 5716\n","Epoch: [39593] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0185 (0.0204) time: 0.0950 data: 0.0166 max mem: 5716\n","Epoch: [39593] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0185 (0.0204) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39594] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0203 (0.0203) time: 0.2904 data: 0.1999 max mem: 5716\n","Epoch: [39594] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0203 (0.0208) time: 0.0969 data: 0.0168 max mem: 5716\n","Epoch: [39594] Total time: 0:00:01 (0.1010 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0203 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39595] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0265 (0.0265) time: 0.2871 data: 0.1958 max mem: 5716\n","Epoch: [39595] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0209 (0.0208) time: 0.0955 data: 0.0165 max mem: 5716\n","Epoch: 
[39595] Total time: 0:00:01 (0.0997 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0209 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39596] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0241 (0.0241) time: 0.2910 data: 0.2016 max mem: 5716\n","Epoch: [39596] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0237 (0.0233) time: 0.0953 data: 0.0169 max mem: 5716\n","Epoch: [39596] Total time: 0:00:01 (0.0994 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0237 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39597] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0270 (0.0270) time: 0.2857 data: 0.1940 max mem: 5716\n","Epoch: [39597] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0226 (0.0231) time: 0.0946 data: 0.0163 max mem: 5716\n","Epoch: [39597] Total time: 0:00:01 (0.0987 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0226 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39598] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0198 (0.0198) time: 0.2991 data: 0.2087 max mem: 5716\n","Epoch: [39598] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0229 (0.0233) time: 0.0964 data: 0.0176 max mem: 5716\n","Epoch: [39598] Total time: 0:00:01 (0.1005 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0229 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39599] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0209 (0.0209) time: 0.2880 data: 0.2009 max mem: 5716\n","Epoch: [39599] [11/12] eta: 0:00:00 
lr: 0.000000 loss: 0.0238 (0.0243) time: 0.0945 data: 0.0169 max mem: 5716\n","Epoch: [39599] Total time: 0:00:01 (0.0986 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0238 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39600] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0286 (0.0286) time: 0.2891 data: 0.1986 max mem: 5716\n","Epoch: [39600] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0236 (0.0245) time: 0.0949 data: 0.0167 max mem: 5716\n","Epoch: [39600] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0236 (0.0245) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39601] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0267 (0.0267) time: 0.2831 data: 0.1938 max mem: 5716\n","Epoch: [39601] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0244 (0.0244) time: 0.0956 data: 0.0163 max mem: 5716\n","Epoch: [39601] Total time: 0:00:01 (0.0998 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0244 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39602] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0204 (0.0204) time: 0.2934 data: 0.2032 max mem: 5716\n","Epoch: [39602] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0214 (0.0209) time: 0.0958 data: 0.0171 max mem: 5716\n","Epoch: [39602] Total time: 0:00:01 (0.0999 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0214 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39603] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0198 
(0.0198) time: 0.2834 data: 0.1949 max mem: 5716\n","Epoch: [39603] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0198 (0.0206) time: 0.0944 data: 0.0164 max mem: 5716\n","Epoch: [39603] Total time: 0:00:01 (0.0985 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0198 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39604] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0172 (0.0172) time: 0.2916 data: 0.2015 max mem: 5716\n","Epoch: [39604] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0199 (0.0209) time: 0.0980 data: 0.0169 max mem: 5716\n","Epoch: [39604] Total time: 0:00:01 (0.1022 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0199 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39605] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0197 (0.0197) time: 0.2956 data: 0.1988 max mem: 5716\n","Epoch: [39605] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0232 (0.0231) time: 0.0960 data: 0.0167 max mem: 5716\n","Epoch: [39605] Total time: 0:00:01 (0.1002 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0232 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39606] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0261 (0.0261) time: 0.3007 data: 0.2129 max mem: 5716\n","Epoch: [39606] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0234 (0.0234) time: 0.0966 data: 0.0179 max mem: 5716\n","Epoch: [39606] Total time: 0:00:01 (0.1008 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0234 (0.0234) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39607] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0219 (0.0219) time: 0.2966 data: 0.2029 max mem: 5716\n","Epoch: [39607] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0219 (0.0234) time: 0.0957 data: 0.0171 max mem: 5716\n","Epoch: [39607] Total time: 0:00:01 (0.0999 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0219 (0.0234) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39608] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0273 (0.0273) time: 0.2901 data: 0.2032 max mem: 5716\n","Epoch: [39608] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0238 (0.0239) time: 0.0950 data: 0.0171 max mem: 5716\n","Epoch: [39608] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0238 (0.0239) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39609] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0259 (0.0259) time: 0.2975 data: 0.2095 max mem: 5716\n","Epoch: [39609] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0236 (0.0244) time: 0.0954 data: 0.0176 max mem: 5716\n","Epoch: [39609] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0236 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39610] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0271 (0.0271) time: 0.2911 data: 0.2041 max mem: 5716\n","Epoch: [39610] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0242 (0.0247) time: 0.0960 data: 0.0172 max mem: 5716\n","Epoch: [39610] Total time: 0:00:01 (0.1001 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0242 
(0.0247) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39611] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0240 (0.0240) time: 0.2872 data: 0.1974 max mem: 5716\n","Epoch: [39611] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0201 (0.0206) time: 0.0953 data: 0.0166 max mem: 5716\n","Epoch: [39611] Total time: 0:00:01 (0.0997 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0201 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39612] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0184 (0.0184) time: 0.2899 data: 0.1986 max mem: 5716\n","Epoch: [39612] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0196 (0.0206) time: 0.0955 data: 0.0167 max mem: 5716\n","Epoch: [39612] Total time: 0:00:01 (0.0997 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0196 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39613] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0230 (0.0230) time: 0.2878 data: 0.1984 max mem: 5716\n","Epoch: [39613] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0217 (0.0207) time: 0.0957 data: 0.0167 max mem: 5716\n","Epoch: [39613] Total time: 0:00:01 (0.0998 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0217 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39614] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0250 (0.0250) time: 0.2950 data: 0.2085 max mem: 5716\n","Epoch: [39614] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0222 (0.0232) time: 0.0951 data: 0.0175 max mem: 5716\n","Epoch: 
[39614] Total time: 0:00:01 (0.0993 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0222 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39615] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0260 (0.0260) time: 0.2852 data: 0.1964 max mem: 5716\n","Epoch: [39615] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0240 (0.0235) time: 0.0946 data: 0.0165 max mem: 5716\n","Epoch: [39615] Total time: 0:00:01 (0.0987 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0240 (0.0235) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39616] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0234 (0.0234) time: 0.2976 data: 0.2105 max mem: 5716\n","Epoch: [39616] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0233 (0.0231) time: 0.0958 data: 0.0177 max mem: 5716\n","Epoch: [39616] Total time: 0:00:01 (0.1000 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0233 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39617] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0255 (0.0255) time: 0.2972 data: 0.1945 max mem: 5716\n","Epoch: [39617] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0237 (0.0243) time: 0.0959 data: 0.0164 max mem: 5716\n","Epoch: [39617] Total time: 0:00:01 (0.1001 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0237 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39618] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0233 (0.0233) time: 0.2837 data: 0.1935 max mem: 5716\n","Epoch: [39618] [11/12] eta: 0:00:00 
lr: 0.000000 loss: 0.0237 (0.0240) time: 0.0956 data: 0.0163 max mem: 5716\n","Epoch: [39618] Total time: 0:00:01 (0.0998 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0237 (0.0240) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39619] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0242 (0.0242) time: 0.2912 data: 0.1983 max mem: 5716\n","Epoch: [39619] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0234 (0.0246) time: 0.0960 data: 0.0167 max mem: 5716\n","Epoch: [39619] Total time: 0:00:01 (0.1002 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0234 (0.0246) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39620] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0295 (0.0295) time: 0.3007 data: 0.2130 max mem: 5716\n","Epoch: [39620] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0202 (0.0210) time: 0.0969 data: 0.0179 max mem: 5716\n","Epoch: [39620] Total time: 0:00:01 (0.1014 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0202 (0.0210) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39621] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0230 (0.0230) time: 0.2875 data: 0.1997 max mem: 5716\n","Epoch: [39621] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0196 (0.0207) time: 0.0967 data: 0.0168 max mem: 5716\n","Epoch: [39621] Total time: 0:00:01 (0.1011 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0196 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39622] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0261 
(0.0261) time: 0.2829 data: 0.1915 max mem: 5716\n","Epoch: [39622] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0196 (0.0207) time: 0.0947 data: 0.0161 max mem: 5716\n","Epoch: [39622] Total time: 0:00:01 (0.0989 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0196 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39623] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0208 (0.0208) time: 0.2801 data: 0.1887 max mem: 5716\n","Epoch: [39623] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0223 (0.0232) time: 0.0948 data: 0.0159 max mem: 5716\n","Epoch: [39623] Total time: 0:00:01 (0.0988 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0223 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39624] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0253 (0.0253) time: 0.3055 data: 0.2045 max mem: 5716\n","Epoch: [39624] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0225 (0.0233) time: 0.0959 data: 0.0172 max mem: 5716\n","Epoch: [39624] Total time: 0:00:01 (0.1000 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0225 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39625] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0207 (0.0207) time: 0.2877 data: 0.2003 max mem: 5716\n","Epoch: [39625] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0230 (0.0232) time: 0.0950 data: 0.0168 max mem: 5716\n","Epoch: [39625] Total time: 0:00:01 (0.0991 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0230 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39626] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0244 (0.0244) time: 0.2902 data: 0.2026 max mem: 5716\n","Epoch: [39626] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0238 (0.0242) time: 0.0992 data: 0.0171 max mem: 5716\n","Epoch: [39626] Total time: 0:00:01 (0.1033 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0238 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39627] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0207 (0.0207) time: 0.2883 data: 0.1990 max mem: 5716\n","Epoch: [39627] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0241 (0.0243) time: 0.0950 data: 0.0167 max mem: 5716\n","Epoch: [39627] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0241 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39628] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0243 (0.0243) time: 0.2917 data: 0.1990 max mem: 5716\n","Epoch: [39628] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0251 (0.0242) time: 0.0982 data: 0.0167 max mem: 5716\n","Epoch: [39628] Total time: 0:00:01 (0.1024 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0251 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39629] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0182 (0.0182) time: 0.2978 data: 0.2100 max mem: 5716\n","Epoch: [39629] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0208 (0.0209) time: 0.0963 data: 0.0177 max mem: 5716\n","Epoch: [39629] Total time: 0:00:01 (0.1004 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0208 
(0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39630] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0221 (0.0221) time: 0.2961 data: 0.2082 max mem: 5716\n","Epoch: [39630] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0210 (0.0209) time: 0.0961 data: 0.0175 max mem: 5716\n","Epoch: [39630] Total time: 0:00:01 (0.1002 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0210 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39631] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0152 (0.0152) time: 0.2833 data: 0.1930 max mem: 5716\n","Epoch: [39631] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0203 (0.0208) time: 0.0947 data: 0.0162 max mem: 5716\n","Epoch: [39631] Total time: 0:00:01 (0.0988 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0203 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39632] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0238 (0.0238) time: 0.2972 data: 0.1995 max mem: 5716\n","Epoch: [39632] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0232 (0.0232) time: 0.0958 data: 0.0168 max mem: 5716\n","Epoch: [39632] Total time: 0:00:01 (0.0999 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0232 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39633] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0192 (0.0192) time: 0.2866 data: 0.1957 max mem: 5716\n","Epoch: [39633] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0238 (0.0235) time: 0.0963 data: 0.0165 max mem: 5716\n","Epoch: 
[39633] Total time: 0:00:01 (0.1004 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0238 (0.0235) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39634] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0264 (0.0264) time: 0.2830 data: 0.1913 max mem: 5716\n","Epoch: [39634] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0231 (0.0229) time: 0.0947 data: 0.0161 max mem: 5716\n","Epoch: [39634] Total time: 0:00:01 (0.0988 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0231 (0.0229) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39635] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0253 (0.0253) time: 0.2925 data: 0.1969 max mem: 5716\n","Epoch: [39635] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0226 (0.0243) time: 0.0949 data: 0.0166 max mem: 5716\n","Epoch: [39635] Total time: 0:00:01 (0.1004 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0226 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39636] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0234 (0.0234) time: 0.2940 data: 0.2061 max mem: 5716\n","Epoch: [39636] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0230 (0.0238) time: 0.0962 data: 0.0173 max mem: 5716\n","Epoch: [39636] Total time: 0:00:01 (0.1003 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0230 (0.0238) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39637] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0254 (0.0254) time: 0.3015 data: 0.2018 max mem: 5716\n","Epoch: [39637] [11/12] eta: 0:00:00 
lr: 0.000000 loss: 0.0234 (0.0244) time: 0.0958 data: 0.0170 max mem: 5716\n","Epoch: [39637] Total time: 0:00:01 (0.1000 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0234 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39638] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0231 (0.0231) time: 0.3016 data: 0.2042 max mem: 5716\n","Epoch: [39638] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0189 (0.0208) time: 0.0976 data: 0.0172 max mem: 5716\n","Epoch: [39638] Total time: 0:00:01 (0.1017 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0189 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39639] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0163 (0.0163) time: 0.3245 data: 0.2157 max mem: 5716\n","Epoch: [39639] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0198 (0.0206) time: 0.0987 data: 0.0182 max mem: 5716\n","Epoch: [39639] Total time: 0:00:01 (0.1029 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0198 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39640] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0152 (0.0152) time: 0.2893 data: 0.1987 max mem: 5716\n","Epoch: [39640] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0206 (0.0205) time: 0.0958 data: 0.0167 max mem: 5716\n","Epoch: [39640] Total time: 0:00:01 (0.1001 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0206 (0.0205) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39641] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0207 
(0.0207) time: 0.2865 data: 0.1971 max mem: 5716\n","Epoch: [39641] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0227 (0.0231) time: 0.0947 data: 0.0166 max mem: 5716\n","Epoch: [39641] Total time: 0:00:01 (0.0989 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0227 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39642] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0199 (0.0199) time: 0.2983 data: 0.2084 max mem: 5716\n","Epoch: [39642] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0230 (0.0233) time: 0.0959 data: 0.0175 max mem: 5716\n","Epoch: [39642] Total time: 0:00:01 (0.1001 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0230 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39643] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0214 (0.0214) time: 0.2844 data: 0.1968 max mem: 5716\n","Epoch: [39643] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0219 (0.0232) time: 0.0955 data: 0.0166 max mem: 5716\n","Epoch: [39643] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0219 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39644] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0250 (0.0250) time: 0.2824 data: 0.1912 max mem: 5716\n","Epoch: [39644] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0239 (0.0238) time: 0.0949 data: 0.0161 max mem: 5716\n","Epoch: [39644] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0239 (0.0238) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39645] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0225 (0.0225) time: 0.2914 data: 0.1978 max mem: 5716\n","Epoch: [39645] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0234 (0.0244) time: 0.0954 data: 0.0167 max mem: 5716\n","Epoch: [39645] Total time: 0:00:01 (0.1009 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0234 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39646] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0219 (0.0219) time: 0.2968 data: 0.1973 max mem: 5716\n","Epoch: [39646] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0235 (0.0241) time: 0.0953 data: 0.0166 max mem: 5716\n","Epoch: [39646] Total time: 0:00:01 (0.0994 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0235 (0.0241) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39647] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0198 (0.0198) time: 0.2932 data: 0.2046 max mem: 5716\n","Epoch: [39647] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0198 (0.0207) time: 0.0962 data: 0.0172 max mem: 5716\n","Epoch: [39647] Total time: 0:00:01 (0.1004 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0198 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39648] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0211 (0.0211) time: 0.2938 data: 0.2043 max mem: 5716\n","Epoch: [39648] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0202 (0.0207) time: 0.0953 data: 0.0172 max mem: 5716\n","Epoch: [39648] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0202 
(0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39649] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0151 (0.0151) time: 0.2935 data: 0.2056 max mem: 5716\n","Epoch: [39649] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0210 (0.0206) time: 0.0954 data: 0.0173 max mem: 5716\n","Epoch: [39649] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0210 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39650] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0208 (0.0208) time: 0.2799 data: 0.1897 max mem: 5716\n","Epoch: [39650] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0229 (0.0230) time: 0.0939 data: 0.0160 max mem: 5716\n","Epoch: [39650] Total time: 0:00:01 (0.0979 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0229 (0.0230) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39651] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0262 (0.0262) time: 0.2831 data: 0.1922 max mem: 5716\n","Epoch: [39651] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0232 (0.0234) time: 0.0976 data: 0.0162 max mem: 5716\n","Epoch: [39651] Total time: 0:00:01 (0.1018 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0232 (0.0234) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39652] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0211 (0.0211) time: 0.2969 data: 0.1957 max mem: 5716\n","Epoch: [39652] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0235 (0.0232) time: 0.0959 data: 0.0165 max mem: 5716\n","Epoch: 
[39652] Total time: 0:00:01 (0.1000 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0235 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39653] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0256 (0.0256) time: 0.2956 data: 0.2068 max mem: 5716\n","Epoch: [39653] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0237 (0.0244) time: 0.0958 data: 0.0174 max mem: 5716\n","Epoch: [39653] Total time: 0:00:01 (0.1000 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0237 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39654] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0228 (0.0228) time: 0.2945 data: 0.2054 max mem: 5716\n","Epoch: [39654] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0234 (0.0241) time: 0.0954 data: 0.0173 max mem: 5716\n","Epoch: [39654] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0234 (0.0241) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39655] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0211 (0.0211) time: 0.2864 data: 0.1937 max mem: 5716\n","Epoch: [39655] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0247 (0.0243) time: 0.0973 data: 0.0163 max mem: 5716\n","Epoch: [39655] Total time: 0:00:01 (0.1014 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0247 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39656] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0185 (0.0185) time: 0.2887 data: 0.2009 max mem: 5716\n","Epoch: [39656] [11/12] eta: 
0:00:00 lr: 0.000000 loss: 0.0195 (0.0209) time: 0.0951 data: 0.0169 max mem: 5716\n","Epoch: [39656] Total time: 0:00:01 (0.0991 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0195 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39657] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0228 (0.0228) time: 0.3013 data: 0.2102 max mem: 5716\n","Epoch: [39657] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0203 (0.0206) time: 0.0972 data: 0.0177 max mem: 5716\n","Epoch: [39657] Total time: 0:00:01 (0.1014 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0203 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39658] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0188 (0.0188) time: 0.2865 data: 0.1972 max mem: 5716\n","Epoch: [39658] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0202 (0.0203) time: 0.0955 data: 0.0166 max mem: 5716\n","Epoch: [39658] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0202 (0.0203) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39659] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0215 (0.0215) time: 0.3044 data: 0.2059 max mem: 5716\n","Epoch: [39659] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0238 (0.0232) time: 0.0969 data: 0.0173 max mem: 5716\n","Epoch: [39659] Total time: 0:00:01 (0.1010 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0238 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39660] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 
0.0269 (0.0269) time: 0.2952 data: 0.2074 max mem: 5716\n","Epoch: [39660] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0232 (0.0234) time: 0.0959 data: 0.0174 max mem: 5716\n","Epoch: [39660] Total time: 0:00:01 (0.1000 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0232 (0.0234) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39661] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0208 (0.0208) time: 0.2886 data: 0.1973 max mem: 5716\n","Epoch: [39661] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0224 (0.0231) time: 0.0960 data: 0.0166 max mem: 5716\n","Epoch: [39661] Total time: 0:00:01 (0.1001 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0224 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39662] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0318 (0.0318) time: 0.3002 data: 0.2118 max mem: 5716\n","Epoch: [39662] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0225 (0.0244) time: 0.0973 data: 0.0178 max mem: 5716\n","Epoch: [39662] Total time: 0:00:01 (0.1014 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0225 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39663] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0328 (0.0328) time: 0.2959 data: 0.2076 max mem: 5716\n","Epoch: [39663] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0218 (0.0245) time: 0.0962 data: 0.0175 max mem: 5716\n","Epoch: [39663] Total time: 0:00:01 (0.1014 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0218 (0.0245) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39664] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0231 (0.0231) time: 0.2967 data: 0.2074 max mem: 5716\n","Epoch: [39664] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0233 (0.0242) time: 0.0996 data: 0.0175 max mem: 5716\n","Epoch: [39664] Total time: 0:00:01 (0.1038 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0233 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39665] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0241 (0.0241) time: 0.2888 data: 0.1994 max mem: 5716\n","Epoch: [39665] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0198 (0.0206) time: 0.0953 data: 0.0168 max mem: 5716\n","Epoch: [39665] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0198 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39666] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0229 (0.0229) time: 0.2893 data: 0.2014 max mem: 5716\n","Epoch: [39666] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0200 (0.0206) time: 0.0949 data: 0.0169 max mem: 5716\n","Epoch: [39666] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0200 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39667] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0181 (0.0181) time: 0.2891 data: 0.1985 max mem: 5716\n","Epoch: [39667] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0207 (0.0211) time: 0.0948 data: 0.0167 max mem: 5716\n","Epoch: [39667] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0207 
(0.0211) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39668] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0222 (0.0222) time: 0.2832 data: 0.1941 max mem: 5716\n","Epoch: [39668] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0230 (0.0233) time: 0.0941 data: 0.0163 max mem: 5716\n","Epoch: [39668] Total time: 0:00:01 (0.0981 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0230 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39669] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0253 (0.0253) time: 0.2909 data: 0.2031 max mem: 5716\n","Epoch: [39669] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0229 (0.0232) time: 0.0951 data: 0.0171 max mem: 5716\n","Epoch: [39669] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0229 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39670] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0275 (0.0275) time: 0.2938 data: 0.2068 max mem: 5716\n","Epoch: [39670] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0233 (0.0235) time: 0.0954 data: 0.0174 max mem: 5716\n","Epoch: [39670] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0233 (0.0235) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39671] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0239 (0.0239) time: 0.2961 data: 0.2063 max mem: 5716\n","Epoch: [39671] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0246 (0.0245) time: 0.0964 data: 0.0174 max mem: 5716\n","Epoch: 
[39671] Total time: 0:00:01 (0.1005 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0246 (0.0245) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39672] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0221 (0.0221) time: 0.2939 data: 0.2071 max mem: 5716\n","Epoch: [39672] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0223 (0.0241) time: 0.0958 data: 0.0174 max mem: 5716\n","Epoch: [39672] Total time: 0:00:01 (0.0999 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0223 (0.0241) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39673] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0273 (0.0273) time: 0.2889 data: 0.1962 max mem: 5716\n","Epoch: [39673] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0247 (0.0243) time: 0.0988 data: 0.0165 max mem: 5716\n","Epoch: [39673] Total time: 0:00:01 (0.1029 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0247 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39674] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0214 (0.0214) time: 0.2893 data: 0.1971 max mem: 5716\n","Epoch: [39674] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0202 (0.0209) time: 0.0972 data: 0.0166 max mem: 5716\n","Epoch: [39674] Total time: 0:00:01 (0.1014 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0202 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39675] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0196 (0.0196) time: 0.2946 data: 0.2033 max mem: 5716\n","Epoch: [39675] [11/12] eta: 
0:00:00 lr: 0.000000 loss: 0.0197 (0.0204) time: 0.0980 data: 0.0171 max mem: 5716\n","Epoch: [39675] Total time: 0:00:01 (0.1021 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0197 (0.0204) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39676] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0230 (0.0230) time: 0.2895 data: 0.1994 max mem: 5716\n","Epoch: [39676] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0212 (0.0208) time: 0.0964 data: 0.0168 max mem: 5716\n","Epoch: [39676] Total time: 0:00:01 (0.1006 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0212 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39677] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0214 (0.0214) time: 0.2938 data: 0.2041 max mem: 5716\n","Epoch: [39677] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0230 (0.0233) time: 0.0961 data: 0.0171 max mem: 5716\n","Epoch: [39677] Total time: 0:00:01 (0.1002 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0230 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39678] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0255 (0.0255) time: 0.2969 data: 0.2083 max mem: 5716\n","Epoch: [39678] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0229 (0.0232) time: 0.0955 data: 0.0175 max mem: 5716\n","Epoch: [39678] Total time: 0:00:01 (0.0997 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0229 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39679] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 
0.0182 (0.0182) time: 0.2844 data: 0.1941 max mem: 5716\n","Epoch: [39679] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0226 (0.0230) time: 0.0951 data: 0.0164 max mem: 5716\n","Epoch: [39679] Total time: 0:00:01 (0.0993 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0226 (0.0230) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39680] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0221 (0.0221) time: 0.2957 data: 0.2081 max mem: 5716\n","Epoch: [39680] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0231 (0.0241) time: 0.0955 data: 0.0175 max mem: 5716\n","Epoch: [39680] Total time: 0:00:01 (0.0997 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0231 (0.0241) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39681] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0253 (0.0253) time: 0.2909 data: 0.2046 max mem: 5716\n","Epoch: [39681] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0242 (0.0244) time: 0.0985 data: 0.0172 max mem: 5716\n","Epoch: [39681] Total time: 0:00:01 (0.1026 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0242 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39682] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0238 (0.0238) time: 0.2903 data: 0.2036 max mem: 5716\n","Epoch: [39682] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0238 (0.0244) time: 0.0951 data: 0.0171 max mem: 5716\n","Epoch: [39682] Total time: 0:00:01 (0.0993 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0238 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39683] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0228 (0.0228) time: 0.2904 data: 0.2025 max mem: 5716\n","Epoch: [39683] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0194 (0.0209) time: 0.0947 data: 0.0170 max mem: 5716\n","Epoch: [39683] Total time: 0:00:01 (0.0988 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0194 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39684] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0270 (0.0270) time: 0.2939 data: 0.2055 max mem: 5716\n","Epoch: [39684] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0195 (0.0208) time: 0.0954 data: 0.0173 max mem: 5716\n","Epoch: [39684] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0195 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39685] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0162 (0.0162) time: 0.2894 data: 0.2038 max mem: 5716\n","Epoch: [39685] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0197 (0.0209) time: 0.0947 data: 0.0171 max mem: 5716\n","Epoch: [39685] Total time: 0:00:01 (0.0988 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0197 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39686] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0236 (0.0236) time: 0.2886 data: 0.1974 max mem: 5716\n","Epoch: [39686] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0233 (0.0234) time: 0.0952 data: 0.0166 max mem: 5716\n","Epoch: [39686] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0233 
(0.0234) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39687] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0247 (0.0247) time: 0.2904 data: 0.2008 max mem: 5716\n","Epoch: [39687] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0229 (0.0233) time: 0.0952 data: 0.0169 max mem: 5716\n","Epoch: [39687] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0229 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39688] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0302 (0.0302) time: 0.2967 data: 0.2059 max mem: 5716\n","Epoch: [39688] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0221 (0.0235) time: 0.0965 data: 0.0173 max mem: 5716\n","Epoch: [39688] Total time: 0:00:01 (0.1016 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0221 (0.0235) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39689] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0205 (0.0205) time: 0.2990 data: 0.2124 max mem: 5716\n","Epoch: [39689] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0246 (0.0240) time: 0.0959 data: 0.0179 max mem: 5716\n","Epoch: [39689] Total time: 0:00:01 (0.1001 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0246 (0.0240) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39690] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0227 (0.0227) time: 0.2829 data: 0.1939 max mem: 5716\n","Epoch: [39690] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0242 (0.0239) time: 0.0947 data: 0.0163 max mem: 5716\n","Epoch: 
[39690] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0242 (0.0239) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39691] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0229 (0.0229) time: 0.2840 data: 0.1935 max mem: 5716\n","Epoch: [39691] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0231 (0.0245) time: 0.0943 data: 0.0163 max mem: 5716\n","Epoch: [39691] Total time: 0:00:01 (0.0984 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0231 (0.0245) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39692] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0224 (0.0224) time: 0.2939 data: 0.2021 max mem: 5716\n","Epoch: [39692] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0204 (0.0208) time: 0.0954 data: 0.0170 max mem: 5716\n","Epoch: [39692] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0204 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39693] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0188 (0.0188) time: 0.2904 data: 0.2037 max mem: 5716\n","Epoch: [39693] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0198 (0.0209) time: 0.0955 data: 0.0171 max mem: 5716\n","Epoch: [39693] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0198 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39694] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0181 (0.0181) time: 0.2954 data: 0.1961 max mem: 5716\n","Epoch: [39694] [11/12] eta: 0:00:00 
lr: 0.000000 loss: 0.0205 (0.0207) time: 0.0986 data: 0.0165 max mem: 5716\n","Epoch: [39694] Total time: 0:00:01 (0.1027 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0205 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39695] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0234 (0.0234) time: 0.2892 data: 0.1948 max mem: 5716\n","Epoch: [39695] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0234 (0.0231) time: 0.0964 data: 0.0164 max mem: 5716\n","Epoch: [39695] Total time: 0:00:01 (0.1005 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0234 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39696] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0262 (0.0262) time: 0.3051 data: 0.2088 max mem: 5716\n","Epoch: [39696] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0230 (0.0230) time: 0.0970 data: 0.0176 max mem: 5716\n","Epoch: [39696] Total time: 0:00:01 (0.1012 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0230 (0.0230) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39697] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0204 (0.0204) time: 0.2916 data: 0.1994 max mem: 5716\n","Epoch: [39697] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0236 (0.0234) time: 0.0965 data: 0.0168 max mem: 5716\n","Epoch: [39697] Total time: 0:00:01 (0.1007 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0236 (0.0234) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39698] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0264 
(0.0264) time: 0.2841 data: 0.1915 max mem: 5716\n","Epoch: [39698] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0247 (0.0242) time: 0.0965 data: 0.0161 max mem: 5716\n","Epoch: [39698] Total time: 0:00:01 (0.1006 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0247 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39699] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0298 (0.0298) time: 0.2849 data: 0.1960 max mem: 5716\n","Epoch: [39699] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0233 (0.0243) time: 0.0969 data: 0.0165 max mem: 5716\n","Epoch: [39699] Total time: 0:00:01 (0.1012 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0233 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39700] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0279 (0.0279) time: 0.2898 data: 0.1989 max mem: 5716\n","Epoch: [39700] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0241 (0.0242) time: 0.0961 data: 0.0167 max mem: 5716\n","Epoch: [39700] Total time: 0:00:01 (0.1003 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0241 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39701] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0190 (0.0190) time: 0.2902 data: 0.2009 max mem: 5716\n","Epoch: [39701] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0210 (0.0208) time: 0.0954 data: 0.0169 max mem: 5716\n","Epoch: [39701] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0210 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39702] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0172 (0.0172) time: 0.2858 data: 0.1960 max mem: 5716\n","Epoch: [39702] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0200 (0.0208) time: 0.0942 data: 0.0165 max mem: 5716\n","Epoch: [39702] Total time: 0:00:01 (0.0983 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0200 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39703] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0152 (0.0152) time: 0.2894 data: 0.2024 max mem: 5716\n","Epoch: [39703] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0218 (0.0209) time: 0.0948 data: 0.0170 max mem: 5716\n","Epoch: [39703] Total time: 0:00:01 (0.0989 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0218 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39704] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0252 (0.0252) time: 0.2990 data: 0.2100 max mem: 5716\n","Epoch: [39704] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0228 (0.0230) time: 0.0992 data: 0.0177 max mem: 5716\n","Epoch: [39704] Total time: 0:00:01 (0.1033 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0228 (0.0230) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39705] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0241 (0.0241) time: 0.2920 data: 0.2011 max mem: 5716\n","Epoch: [39705] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0229 (0.0234) time: 0.0960 data: 0.0169 max mem: 5716\n","Epoch: [39705] Total time: 0:00:01 (0.1006 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0229 
(0.0234) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39706] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0209 (0.0209) time: 0.2893 data: 0.1993 max mem: 5716\n","Epoch: [39706] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0220 (0.0231) time: 0.0955 data: 0.0168 max mem: 5716\n","Epoch: [39706] Total time: 0:00:01 (0.0997 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0220 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39707] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0220 (0.0220) time: 0.3046 data: 0.2029 max mem: 5716\n","Epoch: [39707] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0243 (0.0242) time: 0.0963 data: 0.0171 max mem: 5716\n","Epoch: [39707] Total time: 0:00:01 (0.1004 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0243 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39708] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0218 (0.0218) time: 0.2901 data: 0.1892 max mem: 5716\n","Epoch: [39708] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0235 (0.0239) time: 0.0988 data: 0.0159 max mem: 5716\n","Epoch: [39708] Total time: 0:00:01 (0.1028 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0235 (0.0239) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39709] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0244 (0.0244) time: 0.2841 data: 0.1940 max mem: 5716\n","Epoch: [39709] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0244 (0.0242) time: 0.0954 data: 0.0163 max mem: 5716\n","Epoch: 
[39709] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0244 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39710] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0194 (0.0194) time: 0.3031 data: 0.2054 max mem: 5716\n","Epoch: [39710] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0194 (0.0206) time: 0.0964 data: 0.0173 max mem: 5716\n","Epoch: [39710] Total time: 0:00:01 (0.1019 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0194 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39711] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0175 (0.0175) time: 0.2854 data: 0.1956 max mem: 5716\n","Epoch: [39711] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0208 (0.0206) time: 0.0948 data: 0.0164 max mem: 5716\n","Epoch: [39711] Total time: 0:00:01 (0.0989 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0208 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39712] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0179 (0.0179) time: 0.2990 data: 0.1987 max mem: 5716\n","Epoch: [39712] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0193 (0.0204) time: 0.0958 data: 0.0167 max mem: 5716\n","Epoch: [39712] Total time: 0:00:01 (0.0999 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0193 (0.0204) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39713] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0193 (0.0193) time: 0.2855 data: 0.1961 max mem: 5716\n","Epoch: [39713] [11/12] eta: 0:00:00 
lr: 0.000000 loss: 0.0218 (0.0233) time: 0.0947 data: 0.0165 max mem: 5716\n","Epoch: [39713] Total time: 0:00:01 (0.0988 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0218 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39714] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0249 (0.0249) time: 0.2969 data: 0.1977 max mem: 5716\n","Epoch: [39714] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0223 (0.0234) time: 0.0966 data: 0.0166 max mem: 5716\n","Epoch: [39714] Total time: 0:00:01 (0.1021 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0223 (0.0234) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39715] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0269 (0.0269) time: 0.2939 data: 0.1968 max mem: 5716\n","Epoch: [39715] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0230 (0.0234) time: 0.0960 data: 0.0166 max mem: 5716\n","Epoch: [39715] Total time: 0:00:01 (0.1002 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0230 (0.0234) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39716] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0244 (0.0244) time: 0.2845 data: 0.1938 max mem: 5716\n","Epoch: [39716] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0234 (0.0241) time: 0.0954 data: 0.0163 max mem: 5716\n","Epoch: [39716] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0234 (0.0241) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39717] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0276 
(0.0276) time: 0.2920 data: 0.2050 max mem: 5716\n","Epoch: [39717] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0229 (0.0243) time: 0.0952 data: 0.0172 max mem: 5716\n","Epoch: [39717] Total time: 0:00:01 (0.0994 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0229 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39718] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0260 (0.0260) time: 0.2833 data: 0.1916 max mem: 5716\n","Epoch: [39718] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0251 (0.0245) time: 0.0941 data: 0.0161 max mem: 5716\n","Epoch: [39718] Total time: 0:00:01 (0.0982 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0251 (0.0245) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39719] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0186 (0.0186) time: 0.2947 data: 0.2062 max mem: 5716\n","Epoch: [39719] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0206 (0.0206) time: 0.0949 data: 0.0173 max mem: 5716\n","Epoch: [39719] Total time: 0:00:01 (0.0991 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0206 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39720] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0207 (0.0207) time: 0.2903 data: 0.1981 max mem: 5716\n","Epoch: [39720] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0202 (0.0206) time: 0.0949 data: 0.0167 max mem: 5716\n","Epoch: [39720] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0202 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39721] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0208 (0.0208) time: 0.2921 data: 0.2040 max mem: 5716\n","Epoch: [39721] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0197 (0.0208) time: 0.0980 data: 0.0171 max mem: 5716\n","Epoch: [39721] Total time: 0:00:01 (0.1021 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0197 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39722] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0240 (0.0240) time: 0.2921 data: 0.2042 max mem: 5716\n","Epoch: [39722] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0234 (0.0230) time: 0.0953 data: 0.0172 max mem: 5716\n","Epoch: [39722] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0234 (0.0230) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39723] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0272 (0.0272) time: 0.2924 data: 0.2047 max mem: 5716\n","Epoch: [39723] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0231 (0.0234) time: 0.0961 data: 0.0172 max mem: 5716\n","Epoch: [39723] Total time: 0:00:01 (0.1003 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0231 (0.0234) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39724] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0254 (0.0254) time: 0.2875 data: 0.1942 max mem: 5716\n","Epoch: [39724] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0224 (0.0232) time: 0.0950 data: 0.0163 max mem: 5716\n","Epoch: [39724] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0224 
(0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39725] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0245 (0.0245) time: 0.2923 data: 0.2036 max mem: 5716\n","Epoch: [39725] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0245 (0.0243) time: 0.0981 data: 0.0171 max mem: 5716\n","Epoch: [39725] Total time: 0:00:01 (0.1024 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0245 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39726] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0181 (0.0181) time: 0.2983 data: 0.1940 max mem: 5716\n","Epoch: [39726] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0247 (0.0243) time: 0.0950 data: 0.0163 max mem: 5716\n","Epoch: [39726] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0247 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39727] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0235 (0.0235) time: 0.2833 data: 0.1955 max mem: 5716\n","Epoch: [39727] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0235 (0.0244) time: 0.0943 data: 0.0164 max mem: 5716\n","Epoch: [39727] Total time: 0:00:01 (0.0985 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0235 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39728] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0227 (0.0227) time: 0.2855 data: 0.1954 max mem: 5716\n","Epoch: [39728] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0203 (0.0211) time: 0.0943 data: 0.0164 max mem: 5716\n","Epoch: 
[39728] Total time: 0:00:01 (0.0984 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0203 (0.0211) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39729] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0218 (0.0218) time: 0.2957 data: 0.1960 max mem: 5716\n","Epoch: [39729] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0199 (0.0207) time: 0.0951 data: 0.0165 max mem: 5716\n","Epoch: [39729] Total time: 0:00:01 (0.0993 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0199 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39730] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0186 (0.0186) time: 0.3024 data: 0.2060 max mem: 5716\n","Epoch: [39730] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0205 (0.0206) time: 0.0956 data: 0.0173 max mem: 5716\n","Epoch: [39730] Total time: 0:00:01 (0.0998 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0205 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39731] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0249 (0.0249) time: 0.2802 data: 0.1884 max mem: 5716\n","Epoch: [39731] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0229 (0.0232) time: 0.0955 data: 0.0158 max mem: 5716\n","Epoch: [39731] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0229 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39732] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0192 (0.0192) time: 0.2935 data: 0.2056 max mem: 5716\n","Epoch: [39732] [11/12] eta: 0:00:00 
lr: 0.000000 loss: 0.0235 (0.0233) time: 0.0972 data: 0.0173 max mem: 5716\n","Epoch: [39732] Total time: 0:00:01 (0.1014 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0235 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39733] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0233 (0.0233) time: 0.3096 data: 0.2172 max mem: 5716\n","Epoch: [39733] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0226 (0.0228) time: 0.0964 data: 0.0183 max mem: 5716\n","Epoch: [39733] Total time: 0:00:01 (0.1009 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0226 (0.0228) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39734] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0284 (0.0284) time: 0.2859 data: 0.1979 max mem: 5716\n","Epoch: [39734] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0224 (0.0242) time: 0.0945 data: 0.0166 max mem: 5716\n","Epoch: [39734] Total time: 0:00:01 (0.0987 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0224 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39735] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0214 (0.0214) time: 0.2857 data: 0.1962 max mem: 5716\n","Epoch: [39735] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0242 (0.0244) time: 0.0946 data: 0.0165 max mem: 5716\n","Epoch: [39735] Total time: 0:00:01 (0.0988 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0242 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39736] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0206 
(0.0206) time: 0.2997 data: 0.1992 max mem: 5716\n","Epoch: [39736] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0229 (0.0242) time: 0.0960 data: 0.0168 max mem: 5716\n","Epoch: [39736] Total time: 0:00:01 (0.1001 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0229 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39737] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0175 (0.0175) time: 0.2874 data: 0.1958 max mem: 5716\n","Epoch: [39737] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0212 (0.0204) time: 0.0967 data: 0.0164 max mem: 5716\n","Epoch: [39737] Total time: 0:00:01 (0.1008 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0212 (0.0204) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39738] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0219 (0.0219) time: 0.2814 data: 0.1919 max mem: 5716\n","Epoch: [39738] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0206 (0.0209) time: 0.0940 data: 0.0161 max mem: 5716\n","Epoch: [39738] Total time: 0:00:01 (0.0981 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0206 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39739] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0242 (0.0242) time: 0.2812 data: 0.1917 max mem: 5716\n","Epoch: [39739] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0197 (0.0208) time: 0.0942 data: 0.0161 max mem: 5716\n","Epoch: [39739] Total time: 0:00:01 (0.0982 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0197 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39740] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0232 (0.0232) time: 0.2911 data: 0.1927 max mem: 5716\n","Epoch: [39740] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0224 (0.0231) time: 0.0946 data: 0.0162 max mem: 5716\n","Epoch: [39740] Total time: 0:00:01 (0.0987 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0224 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39741] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0267 (0.0267) time: 0.2792 data: 0.1893 max mem: 5716\n","Epoch: [39741] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0224 (0.0232) time: 0.0952 data: 0.0160 max mem: 5716\n","Epoch: [39741] Total time: 0:00:01 (0.0993 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0224 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39742] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0211 (0.0211) time: 0.2893 data: 0.2008 max mem: 5716\n","Epoch: [39742] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0221 (0.0234) time: 0.0950 data: 0.0169 max mem: 5716\n","Epoch: [39742] Total time: 0:00:01 (0.0991 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0221 (0.0234) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39743] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0276 (0.0276) time: 0.2927 data: 0.2046 max mem: 5716\n","Epoch: [39743] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0241 (0.0242) time: 0.0956 data: 0.0172 max mem: 5716\n","Epoch: [39743] Total time: 0:00:01 (0.0997 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0241 
(0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39744] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0238 (0.0238) time: 0.2901 data: 0.2000 max mem: 5716\n","Epoch: [39744] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0226 (0.0243) time: 0.0949 data: 0.0168 max mem: 5716\n","Epoch: [39744] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0226 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39745] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0244 (0.0244) time: 0.2907 data: 0.2047 max mem: 5716\n","Epoch: [39745] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0233 (0.0243) time: 0.0951 data: 0.0172 max mem: 5716\n","Epoch: [39745] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0233 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39746] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0161 (0.0161) time: 0.2854 data: 0.1946 max mem: 5716\n","Epoch: [39746] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0203 (0.0205) time: 0.0949 data: 0.0164 max mem: 5716\n","Epoch: [39746] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0203 (0.0205) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39747] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0200 (0.0200) time: 0.2898 data: 0.2018 max mem: 5716\n","Epoch: [39747] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0207 (0.0211) time: 0.0948 data: 0.0170 max mem: 5716\n","Epoch: 
[39747] Total time: 0:00:01 (0.0989 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0207 (0.0211) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39748] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0170 (0.0170) time: 0.2962 data: 0.2072 max mem: 5716\n","Epoch: [39748] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0207 (0.0211) time: 0.0971 data: 0.0174 max mem: 5716\n","Epoch: [39748] Total time: 0:00:01 (0.1014 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0207 (0.0211) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39749] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0241 (0.0241) time: 0.2832 data: 0.1927 max mem: 5716\n","Epoch: [39749] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0235 (0.0234) time: 0.0951 data: 0.0162 max mem: 5716\n","Epoch: [39749] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0235 (0.0234) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39750] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0287 (0.0287) time: 0.2830 data: 0.1933 max mem: 5716\n","Epoch: [39750] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0221 (0.0232) time: 0.0945 data: 0.0163 max mem: 5716\n","Epoch: [39750] Total time: 0:00:01 (0.0986 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0221 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39751] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0226 (0.0226) time: 0.2798 data: 0.1910 max mem: 5716\n","Epoch: [39751] [11/12] eta: 0:00:00 
lr: 0.000000 loss: 0.0229 (0.0232) time: 0.0940 data: 0.0161 max mem: 5716\n","Epoch: [39751] Total time: 0:00:01 (0.1002 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0229 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39752] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0240 (0.0240) time: 0.2834 data: 0.1944 max mem: 5716\n","Epoch: [39752] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0225 (0.0238) time: 0.0950 data: 0.0163 max mem: 5716\n","Epoch: [39752] Total time: 0:00:01 (0.0991 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0225 (0.0238) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39753] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0220 (0.0220) time: 0.2859 data: 0.1993 max mem: 5716\n","Epoch: [39753] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0239 (0.0241) time: 0.0947 data: 0.0167 max mem: 5716\n","Epoch: [39753] Total time: 0:00:01 (0.0988 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0239 (0.0241) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39754] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0195 (0.0195) time: 0.2872 data: 0.1993 max mem: 5716\n","Epoch: [39754] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0251 (0.0243) time: 0.0947 data: 0.0168 max mem: 5716\n","Epoch: [39754] Total time: 0:00:01 (0.0988 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0251 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39755] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0241 
(0.0241) time: 0.2929 data: 0.2063 max mem: 5716\n","Epoch: [39755] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0214 (0.0208) time: 0.0979 data: 0.0173 max mem: 5716\n","Epoch: [39755] Total time: 0:00:01 (0.1021 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0214 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39756] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0238 (0.0238) time: 0.2837 data: 0.1944 max mem: 5716\n","Epoch: [39756] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0203 (0.0210) time: 0.0944 data: 0.0163 max mem: 5716\n","Epoch: [39756] Total time: 0:00:01 (0.0985 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0203 (0.0210) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39757] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0228 (0.0228) time: 0.2933 data: 0.2003 max mem: 5716\n","Epoch: [39757] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0210 (0.0204) time: 0.0954 data: 0.0168 max mem: 5716\n","Epoch: [39757] Total time: 0:00:01 (0.1010 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0210 (0.0204) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39758] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0185 (0.0185) time: 0.2925 data: 0.2044 max mem: 5716\n","Epoch: [39758] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0236 (0.0234) time: 0.0962 data: 0.0172 max mem: 5716\n","Epoch: [39758] Total time: 0:00:01 (0.1004 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0236 (0.0234) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39759] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0215 (0.0215) time: 0.2999 data: 0.2001 max mem: 5716\n","Epoch: [39759] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0227 (0.0233) time: 0.0969 data: 0.0168 max mem: 5716\n","Epoch: [39759] Total time: 0:00:01 (0.1011 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0227 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39760] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0277 (0.0277) time: 0.2915 data: 0.2040 max mem: 5716\n","Epoch: [39760] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0231 (0.0232) time: 0.0959 data: 0.0172 max mem: 5716\n","Epoch: [39760] Total time: 0:00:01 (0.1000 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0231 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39761] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0226 (0.0226) time: 0.2948 data: 0.1937 max mem: 5716\n","Epoch: [39761] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0244 (0.0243) time: 0.0962 data: 0.0163 max mem: 5716\n","Epoch: [39761] Total time: 0:00:01 (0.1004 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0244 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39762] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0216 (0.0216) time: 0.3055 data: 0.2035 max mem: 5716\n","Epoch: [39762] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0230 (0.0245) time: 0.0984 data: 0.0171 max mem: 5716\n","Epoch: [39762] Total time: 0:00:01 (0.1025 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0230 
(0.0245) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39763] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0220 (0.0220) time: 0.2832 data: 0.1936 max mem: 5716\n","Epoch: [39763] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0230 (0.0237) time: 0.0958 data: 0.0163 max mem: 5716\n","Epoch: [39763] Total time: 0:00:01 (0.1018 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0230 (0.0237) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39764] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0189 (0.0189) time: 0.2862 data: 0.1921 max mem: 5716\n","Epoch: [39764] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0209 (0.0207) time: 0.0943 data: 0.0161 max mem: 5716\n","Epoch: [39764] Total time: 0:00:01 (0.0983 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0209 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39765] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0173 (0.0173) time: 0.2922 data: 0.2043 max mem: 5716\n","Epoch: [39765] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0196 (0.0208) time: 0.0965 data: 0.0172 max mem: 5716\n","Epoch: [39765] Total time: 0:00:01 (0.1006 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0196 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39766] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0223 (0.0223) time: 0.2860 data: 0.1958 max mem: 5716\n","Epoch: [39766] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0204 (0.0207) time: 0.0953 data: 0.0165 max mem: 5716\n","Epoch: 
[39766] Total time: 0:00:01 (0.0994 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0204 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39767] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0183 (0.0183) time: 0.2898 data: 0.1956 max mem: 5716\n","Epoch: [39767] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0213 (0.0233) time: 0.0971 data: 0.0165 max mem: 5716\n","Epoch: [39767] Total time: 0:00:01 (0.1012 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0213 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39768] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0231 (0.0231) time: 0.2861 data: 0.1952 max mem: 5716\n","Epoch: [39768] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0231 (0.0232) time: 0.0953 data: 0.0164 max mem: 5716\n","Epoch: [39768] Total time: 0:00:01 (0.0994 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0231 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39769] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0183 (0.0183) time: 0.2871 data: 0.1974 max mem: 5716\n","Epoch: [39769] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0228 (0.0231) time: 0.0953 data: 0.0166 max mem: 5716\n","Epoch: [39769] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0228 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39770] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0222 (0.0222) time: 0.2891 data: 0.1982 max mem: 5716\n","Epoch: [39770] [11/12] eta: 0:00:00 
lr: 0.000000 loss: 0.0243 (0.0244) time: 0.0955 data: 0.0167 max mem: 5716\n","Epoch: [39770] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0243 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39771] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0211 (0.0211) time: 0.2968 data: 0.2056 max mem: 5716\n","Epoch: [39771] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0232 (0.0245) time: 0.0959 data: 0.0173 max mem: 5716\n","Epoch: [39771] Total time: 0:00:01 (0.1000 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0232 (0.0245) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39772] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0239 (0.0239) time: 0.2873 data: 0.1981 max mem: 5716\n","Epoch: [39772] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0240 (0.0240) time: 0.0947 data: 0.0166 max mem: 5716\n","Epoch: [39772] Total time: 0:00:01 (0.1002 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0240 (0.0240) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39773] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0195 (0.0195) time: 0.2903 data: 0.1997 max mem: 5716\n","Epoch: [39773] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0202 (0.0209) time: 0.0954 data: 0.0168 max mem: 5716\n","Epoch: [39773] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0202 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39774] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0195 
(0.0195) time: 0.2829 data: 0.1906 max mem: 5716\n","Epoch: [39774] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0210 (0.0209) time: 0.0947 data: 0.0160 max mem: 5716\n","Epoch: [39774] Total time: 0:00:01 (0.0987 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0210 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39775] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0191 (0.0191) time: 0.2815 data: 0.1891 max mem: 5716\n","Epoch: [39775] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0200 (0.0211) time: 0.0972 data: 0.0159 max mem: 5716\n","Epoch: [39775] Total time: 0:00:01 (0.1013 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0200 (0.0211) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39776] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0232 (0.0232) time: 0.2881 data: 0.2005 max mem: 5716\n","Epoch: [39776] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0226 (0.0230) time: 0.0948 data: 0.0169 max mem: 5716\n","Epoch: [39776] Total time: 0:00:01 (0.1004 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0226 (0.0230) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39777] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0186 (0.0186) time: 0.2877 data: 0.1999 max mem: 5716\n","Epoch: [39777] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0231 (0.0234) time: 0.0947 data: 0.0168 max mem: 5716\n","Epoch: [39777] Total time: 0:00:01 (0.0988 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0231 (0.0234) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39778] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0242 (0.0242) time: 0.2887 data: 0.1970 max mem: 5716\n","Epoch: [39778] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0240 (0.0230) time: 0.0951 data: 0.0166 max mem: 5716\n","Epoch: [39778] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0240 (0.0230) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39779] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0282 (0.0282) time: 0.2965 data: 0.2087 max mem: 5716\n","Epoch: [39779] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0235 (0.0239) time: 0.0960 data: 0.0176 max mem: 5716\n","Epoch: [39779] Total time: 0:00:01 (0.1001 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0235 (0.0239) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39780] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0191 (0.0191) time: 0.2951 data: 0.2076 max mem: 5716\n","Epoch: [39780] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0239 (0.0244) time: 0.0958 data: 0.0175 max mem: 5716\n","Epoch: [39780] Total time: 0:00:01 (0.1000 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0239 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39781] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0235 (0.0235) time: 0.2925 data: 0.1934 max mem: 5716\n","Epoch: [39781] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0242 (0.0244) time: 0.0955 data: 0.0163 max mem: 5716\n","Epoch: [39781] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0242 
(0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39782] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0155 (0.0155) time: 0.2866 data: 0.1912 max mem: 5716\n","Epoch: [39782] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0201 (0.0208) time: 0.0950 data: 0.0161 max mem: 5716\n","Epoch: [39782] Total time: 0:00:01 (0.0991 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0201 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39783] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0192 (0.0192) time: 0.2842 data: 0.1911 max mem: 5716\n","Epoch: [39783] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0197 (0.0210) time: 0.0972 data: 0.0161 max mem: 5716\n","Epoch: [39783] Total time: 0:00:01 (0.1013 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0197 (0.0210) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39784] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0186 (0.0186) time: 0.2869 data: 0.1986 max mem: 5716\n","Epoch: [39784] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0201 (0.0207) time: 0.0955 data: 0.0167 max mem: 5716\n","Epoch: [39784] Total time: 0:00:01 (0.0997 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0201 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39785] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0260 (0.0260) time: 0.2774 data: 0.1880 max mem: 5716\n","Epoch: [39785] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0245 (0.0237) time: 0.0937 data: 0.0158 max mem: 5716\n","Epoch: 
[39785] Total time: 0:00:01 (0.0978 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0245 (0.0237) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39786] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0257 (0.0257) time: 0.2872 data: 0.1961 max mem: 5716\n","Epoch: [39786] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0231 (0.0229) time: 0.0946 data: 0.0165 max mem: 5716\n","Epoch: [39786] Total time: 0:00:01 (0.0989 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0231 (0.0229) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39787] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0221 (0.0221) time: 0.2896 data: 0.1987 max mem: 5716\n","Epoch: [39787] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0229 (0.0235) time: 0.0950 data: 0.0167 max mem: 5716\n","Epoch: [39787] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0229 (0.0235) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39788] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0210 (0.0210) time: 0.2857 data: 0.1972 max mem: 5716\n","Epoch: [39788] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0235 (0.0242) time: 0.0946 data: 0.0166 max mem: 5716\n","Epoch: [39788] Total time: 0:00:01 (0.0986 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0235 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39789] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0205 (0.0205) time: 0.2980 data: 0.1985 max mem: 5716\n","Epoch: [39789] [11/12] eta: 0:00:00 
lr: 0.000000 loss: 0.0231 (0.0240) time: 0.0962 data: 0.0167 max mem: 5716\n","Epoch: [39789] Total time: 0:00:01 (0.1019 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0231 (0.0240) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39790] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0223 (0.0223) time: 0.2838 data: 0.1941 max mem: 5716\n","Epoch: [39790] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0231 (0.0244) time: 0.0962 data: 0.0163 max mem: 5716\n","Epoch: [39790] Total time: 0:00:01 (0.1004 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0231 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39791] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0232 (0.0232) time: 0.2992 data: 0.2031 max mem: 5716\n","Epoch: [39791] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0200 (0.0209) time: 0.0958 data: 0.0171 max mem: 5716\n","Epoch: [39791] Total time: 0:00:01 (0.0999 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0200 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39792] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0289 (0.0289) time: 0.2848 data: 0.1952 max mem: 5716\n","Epoch: [39792] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0201 (0.0210) time: 0.0949 data: 0.0164 max mem: 5716\n","Epoch: [39792] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0201 (0.0210) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39793] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0161 
(0.0161) time: 0.2850 data: 0.1947 max mem: 5716\n","Epoch: [39793] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0206 (0.0209) time: 0.0940 data: 0.0164 max mem: 5716\n","Epoch: [39793] Total time: 0:00:01 (0.0981 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0206 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39794] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0241 (0.0241) time: 0.2865 data: 0.2002 max mem: 5716\n","Epoch: [39794] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0232 (0.0233) time: 0.0944 data: 0.0168 max mem: 5716\n","Epoch: [39794] Total time: 0:00:01 (0.0985 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0232 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39795] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0245 (0.0245) time: 0.2807 data: 0.1899 max mem: 5716\n","Epoch: [39795] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0228 (0.0230) time: 0.0941 data: 0.0160 max mem: 5716\n","Epoch: [39795] Total time: 0:00:01 (0.0981 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0228 (0.0230) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39796] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0225 (0.0225) time: 0.2820 data: 0.1912 max mem: 5716\n","Epoch: [39796] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0226 (0.0232) time: 0.0940 data: 0.0161 max mem: 5716\n","Epoch: [39796] Total time: 0:00:01 (0.0980 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0226 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39797] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0272 (0.0272) time: 0.2777 data: 0.1879 max mem: 5716\n","Epoch: [39797] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0235 (0.0244) time: 0.0942 data: 0.0158 max mem: 5716\n","Epoch: [39797] Total time: 0:00:01 (0.0982 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0235 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39798] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0237 (0.0237) time: 0.2971 data: 0.2022 max mem: 5716\n","Epoch: [39798] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0237 (0.0242) time: 0.0959 data: 0.0170 max mem: 5716\n","Epoch: [39798] Total time: 0:00:01 (0.1000 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0237 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39799] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0246 (0.0246) time: 0.2922 data: 0.2036 max mem: 5716\n","Epoch: [39799] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0243 (0.0241) time: 0.0961 data: 0.0171 max mem: 5716\n","Epoch: [39799] Total time: 0:00:01 (0.1002 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0243 (0.0241) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39800] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0218 (0.0218) time: 0.2885 data: 0.1960 max mem: 5716\n","Epoch: [39800] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0218 (0.0212) time: 0.0977 data: 0.0165 max mem: 5716\n","Epoch: [39800] Total time: 0:00:01 (0.1018 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0218 
(0.0212) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39801] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0177 (0.0177) time: 0.3013 data: 0.2003 max mem: 5716\n","Epoch: [39801] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0205 (0.0206) time: 0.0970 data: 0.0169 max mem: 5716\n","Epoch: [39801] Total time: 0:00:01 (0.1011 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0205 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39802] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0227 (0.0227) time: 0.2892 data: 0.1984 max mem: 5716\n","Epoch: [39802] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0199 (0.0205) time: 0.0953 data: 0.0167 max mem: 5716\n","Epoch: [39802] Total time: 0:00:01 (0.0994 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0199 (0.0205) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39803] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0213 (0.0213) time: 0.2857 data: 0.1972 max mem: 5716\n","Epoch: [39803] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0227 (0.0232) time: 0.0949 data: 0.0166 max mem: 5716\n","Epoch: [39803] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0227 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39804] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0243 (0.0243) time: 0.2899 data: 0.2018 max mem: 5716\n","Epoch: [39804] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0229 (0.0232) time: 0.0964 data: 0.0170 max mem: 5716\n","Epoch: 
[39804] Total time: 0:00:01 (0.1004 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0229 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39805] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0179 (0.0179) time: 0.2780 data: 0.1887 max mem: 5716\n","Epoch: [39805] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0222 (0.0229) time: 0.0936 data: 0.0159 max mem: 5716\n","Epoch: [39805] Total time: 0:00:01 (0.0976 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0222 (0.0229) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39806] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0257 (0.0257) time: 0.2890 data: 0.1990 max mem: 5716\n","Epoch: [39806] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0234 (0.0246) time: 0.0956 data: 0.0167 max mem: 5716\n","Epoch: [39806] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0234 (0.0246) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39807] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0195 (0.0195) time: 0.2845 data: 0.1915 max mem: 5716\n","Epoch: [39807] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0231 (0.0243) time: 0.0946 data: 0.0161 max mem: 5716\n","Epoch: [39807] Total time: 0:00:01 (0.0987 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0231 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39808] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0222 (0.0222) time: 0.2943 data: 0.1934 max mem: 5716\n","Epoch: [39808] [11/12] eta: 0:00:00 
lr: 0.000000 loss: 0.0231 (0.0244) time: 0.0966 data: 0.0163 max mem: 5716\n","Epoch: [39808] Total time: 0:00:01 (0.1008 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0231 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39809] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0177 (0.0177) time: 0.2984 data: 0.2041 max mem: 5716\n","Epoch: [39809] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0197 (0.0210) time: 0.0969 data: 0.0171 max mem: 5716\n","Epoch: [39809] Total time: 0:00:01 (0.1014 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0197 (0.0210) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39810] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0198 (0.0198) time: 0.2904 data: 0.1978 max mem: 5716\n","Epoch: [39810] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0198 (0.0209) time: 0.0957 data: 0.0166 max mem: 5716\n","Epoch: [39810] Total time: 0:00:01 (0.0998 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0198 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39811] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0205 (0.0205) time: 0.2855 data: 0.1962 max mem: 5716\n","Epoch: [39811] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0205 (0.0206) time: 0.0946 data: 0.0165 max mem: 5716\n","Epoch: [39811] Total time: 0:00:01 (0.0987 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0205 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39812] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0222 
(0.0222) time: 0.2899 data: 0.1970 max mem: 5716\n","Epoch: [39812] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0222 (0.0233) time: 0.0953 data: 0.0166 max mem: 5716\n","Epoch: [39812] Total time: 0:00:01 (0.0994 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0222 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39813] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0196 (0.0196) time: 0.2868 data: 0.1956 max mem: 5716\n","Epoch: [39813] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0224 (0.0233) time: 0.0964 data: 0.0165 max mem: 5716\n","Epoch: [39813] Total time: 0:00:01 (0.1005 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0224 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39814] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0226 (0.0226) time: 0.2991 data: 0.2016 max mem: 5716\n","Epoch: [39814] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0226 (0.0231) time: 0.0960 data: 0.0169 max mem: 5716\n","Epoch: [39814] Total time: 0:00:01 (0.1003 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0226 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39815] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0280 (0.0280) time: 0.2861 data: 0.1956 max mem: 5716\n","Epoch: [39815] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0239 (0.0239) time: 0.0952 data: 0.0165 max mem: 5716\n","Epoch: [39815] Total time: 0:00:01 (0.0994 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0239 (0.0239) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39816] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0227 (0.0227) time: 0.2973 data: 0.2091 max mem: 5716\n","Epoch: [39816] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0246 (0.0246) time: 0.0965 data: 0.0176 max mem: 5716\n","Epoch: [39816] Total time: 0:00:01 (0.1006 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0246 (0.0246) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39817] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0251 (0.0251) time: 0.2884 data: 0.2010 max mem: 5716\n","Epoch: [39817] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0251 (0.0245) time: 0.0958 data: 0.0169 max mem: 5716\n","Epoch: [39817] Total time: 0:00:01 (0.0999 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0251 (0.0245) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39818] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0197 (0.0197) time: 0.2827 data: 0.1920 max mem: 5716\n","Epoch: [39818] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0194 (0.0206) time: 0.0948 data: 0.0162 max mem: 5716\n","Epoch: [39818] Total time: 0:00:01 (0.0989 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0194 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39819] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0170 (0.0170) time: 0.2806 data: 0.1899 max mem: 5716\n","Epoch: [39819] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0206 (0.0206) time: 0.0941 data: 0.0160 max mem: 5716\n","Epoch: [39819] Total time: 0:00:01 (0.0981 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0206 
(0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39820] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0180 (0.0180) time: 0.2860 data: 0.1964 max mem: 5716\n","Epoch: [39820] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0204 (0.0208) time: 0.0958 data: 0.0165 max mem: 5716\n","Epoch: [39820] Total time: 0:00:01 (0.0999 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0204 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39821] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0208 (0.0208) time: 0.2950 data: 0.2094 max mem: 5716\n","Epoch: [39821] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0221 (0.0234) time: 0.0950 data: 0.0176 max mem: 5716\n","Epoch: [39821] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0221 (0.0234) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39822] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0250 (0.0250) time: 0.2899 data: 0.2039 max mem: 5716\n","Epoch: [39822] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0229 (0.0233) time: 0.0947 data: 0.0171 max mem: 5716\n","Epoch: [39822] Total time: 0:00:01 (0.0988 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0229 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39823] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0241 (0.0241) time: 0.2887 data: 0.2001 max mem: 5716\n","Epoch: [39823] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0241 (0.0236) time: 0.0949 data: 0.0168 max mem: 5716\n","Epoch: 
[39823] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0241 (0.0236) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39824] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0208 (0.0208) time: 0.2910 data: 0.1973 max mem: 5716\n","Epoch: [39824] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0226 (0.0243) time: 0.0948 data: 0.0166 max mem: 5716\n","Epoch: [39824] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0226 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39825] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0273 (0.0273) time: 0.2930 data: 0.1989 max mem: 5716\n","Epoch: [39825] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0248 (0.0246) time: 0.0979 data: 0.0167 max mem: 5716\n","Epoch: [39825] Total time: 0:00:01 (0.1020 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0248 (0.0246) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39826] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0220 (0.0220) time: 0.2873 data: 0.1956 max mem: 5716\n","Epoch: [39826] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0241 (0.0241) time: 0.0974 data: 0.0165 max mem: 5716\n","Epoch: [39826] Total time: 0:00:01 (0.1015 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0241 (0.0241) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39827] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0230 (0.0230) time: 0.2783 data: 0.1874 max mem: 5716\n","Epoch: [39827] [11/12] eta: 
0:00:00 lr: 0.000000 loss: 0.0203 (0.0204) time: 0.0954 data: 0.0158 max mem: 5716\n","Epoch: [39827] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0203 (0.0204) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39828] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0162 (0.0162) time: 0.2792 data: 0.1874 max mem: 5716\n","Epoch: [39828] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0207 (0.0210) time: 0.0948 data: 0.0158 max mem: 5716\n","Epoch: [39828] Total time: 0:00:01 (0.0988 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0207 (0.0210) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39829] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0218 (0.0218) time: 0.2861 data: 0.1959 max mem: 5716\n","Epoch: [39829] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0200 (0.0206) time: 0.0956 data: 0.0165 max mem: 5716\n","Epoch: [39829] Total time: 0:00:01 (0.0997 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0200 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39830] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0212 (0.0212) time: 0.2837 data: 0.1928 max mem: 5716\n","Epoch: [39830] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0226 (0.0231) time: 0.0946 data: 0.0162 max mem: 5716\n","Epoch: [39830] Total time: 0:00:01 (0.0986 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0226 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39831] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 
0.0200 (0.0200) time: 0.2930 data: 0.2060 max mem: 5716\n","Epoch: [39831] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0212 (0.0228) time: 0.0948 data: 0.0173 max mem: 5716\n","Epoch: [39831] Total time: 0:00:01 (0.0989 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0212 (0.0228) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39832] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0180 (0.0180) time: 0.2872 data: 0.1957 max mem: 5716\n","Epoch: [39832] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0234 (0.0231) time: 0.0947 data: 0.0164 max mem: 5716\n","Epoch: [39832] Total time: 0:00:01 (0.0987 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0234 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39833] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0202 (0.0202) time: 0.2830 data: 0.1941 max mem: 5716\n","Epoch: [39833] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0232 (0.0244) time: 0.0951 data: 0.0163 max mem: 5716\n","Epoch: [39833] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0232 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39834] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0268 (0.0268) time: 0.2839 data: 0.1925 max mem: 5716\n","Epoch: [39834] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0249 (0.0243) time: 0.0950 data: 0.0162 max mem: 5716\n","Epoch: [39834] Total time: 0:00:01 (0.0991 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0249 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39835] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0232 (0.0232) time: 0.2928 data: 0.2058 max mem: 5716\n","Epoch: [39835] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0239 (0.0242) time: 0.0953 data: 0.0173 max mem: 5716\n","Epoch: [39835] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0239 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39836] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0152 (0.0152) time: 0.2955 data: 0.1970 max mem: 5716\n","Epoch: [39836] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0201 (0.0208) time: 0.0959 data: 0.0166 max mem: 5716\n","Epoch: [39836] Total time: 0:00:01 (0.1001 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0201 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39837] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0172 (0.0172) time: 0.2846 data: 0.1957 max mem: 5716\n","Epoch: [39837] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0213 (0.0209) time: 0.0944 data: 0.0164 max mem: 5716\n","Epoch: [39837] Total time: 0:00:01 (0.0986 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0213 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39838] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0227 (0.0227) time: 0.2891 data: 0.1980 max mem: 5716\n","Epoch: [39838] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0205 (0.0209) time: 0.0949 data: 0.0166 max mem: 5716\n","Epoch: [39838] Total time: 0:00:01 (0.0991 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0205 
(0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39839] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0239 (0.0239) time: 0.2862 data: 0.1964 max mem: 5716\n","Epoch: [39839] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0231 (0.0233) time: 0.0944 data: 0.0165 max mem: 5716\n","Epoch: [39839] Total time: 0:00:01 (0.0999 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0231 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39840] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0256 (0.0256) time: 0.2917 data: 0.2033 max mem: 5716\n","Epoch: [39840] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0237 (0.0235) time: 0.0947 data: 0.0171 max mem: 5716\n","Epoch: [39840] Total time: 0:00:01 (0.0988 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0237 (0.0235) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39841] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0259 (0.0259) time: 0.2838 data: 0.1938 max mem: 5716\n","Epoch: [39841] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0230 (0.0233) time: 0.0942 data: 0.0163 max mem: 5716\n","Epoch: [39841] Total time: 0:00:01 (0.0983 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0230 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39842] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0235 (0.0235) time: 0.2871 data: 0.1959 max mem: 5716\n","Epoch: [39842] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0237 (0.0242) time: 0.0949 data: 0.0165 max mem: 5716\n","Epoch: 
[39842] Total time: 0:00:01 (0.0991 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0237 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39843] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0208 (0.0208) time: 0.2857 data: 0.1974 max mem: 5716\n","Epoch: [39843] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0241 (0.0243) time: 0.0950 data: 0.0166 max mem: 5716\n","Epoch: [39843] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0241 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39844] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0255 (0.0255) time: 0.2909 data: 0.2003 max mem: 5716\n","Epoch: [39844] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0240 (0.0242) time: 0.0964 data: 0.0169 max mem: 5716\n","Epoch: [39844] Total time: 0:00:01 (0.1006 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0240 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39845] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0181 (0.0181) time: 0.2961 data: 0.2082 max mem: 5716\n","Epoch: [39845] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0192 (0.0209) time: 0.0987 data: 0.0175 max mem: 5716\n","Epoch: [39845] Total time: 0:00:01 (0.1028 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0192 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39846] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0180 (0.0180) time: 0.2933 data: 0.2058 max mem: 5716\n","Epoch: [39846] [11/12] eta: 
0:00:00 lr: 0.000000 loss: 0.0209 (0.0209) time: 0.0954 data: 0.0173 max mem: 5716\n","Epoch: [39846] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0209 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39847] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0235 (0.0235) time: 0.2917 data: 0.2037 max mem: 5716\n","Epoch: [39847] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0204 (0.0209) time: 0.0951 data: 0.0171 max mem: 5716\n","Epoch: [39847] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0204 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39848] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0234 (0.0234) time: 0.2820 data: 0.1925 max mem: 5716\n","Epoch: [39848] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0226 (0.0231) time: 0.0946 data: 0.0162 max mem: 5716\n","Epoch: [39848] Total time: 0:00:01 (0.0987 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0226 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39849] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0226 (0.0226) time: 0.2823 data: 0.1923 max mem: 5716\n","Epoch: [39849] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0227 (0.0232) time: 0.0944 data: 0.0162 max mem: 5716\n","Epoch: [39849] Total time: 0:00:01 (0.0985 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0227 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39850] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 
0.0275 (0.0275) time: 0.2834 data: 0.1916 max mem: 5716\n","Epoch: [39850] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0227 (0.0233) time: 0.0943 data: 0.0161 max mem: 5716\n","Epoch: [39850] Total time: 0:00:01 (0.0984 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0227 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39851] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0245 (0.0245) time: 0.2801 data: 0.1894 max mem: 5716\n","Epoch: [39851] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0245 (0.0246) time: 0.0945 data: 0.0159 max mem: 5716\n","Epoch: [39851] Total time: 0:00:01 (0.0986 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0245 (0.0246) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39852] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0253 (0.0253) time: 0.2963 data: 0.1946 max mem: 5716\n","Epoch: [39852] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0244 (0.0241) time: 0.0954 data: 0.0164 max mem: 5716\n","Epoch: [39852] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0244 (0.0241) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39853] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0246 (0.0246) time: 0.2910 data: 0.2034 max mem: 5716\n","Epoch: [39853] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0239 (0.0244) time: 0.0950 data: 0.0171 max mem: 5716\n","Epoch: [39853] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0239 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39854] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0171 (0.0171) time: 0.2869 data: 0.1983 max mem: 5716\n","Epoch: [39854] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0210 (0.0211) time: 0.0959 data: 0.0167 max mem: 5716\n","Epoch: [39854] Total time: 0:00:01 (0.1000 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0210 (0.0211) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39855] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0231 (0.0231) time: 0.2926 data: 0.2047 max mem: 5716\n","Epoch: [39855] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0201 (0.0206) time: 0.0950 data: 0.0172 max mem: 5716\n","Epoch: [39855] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0201 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39856] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0165 (0.0165) time: 0.2875 data: 0.1987 max mem: 5716\n","Epoch: [39856] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0205 (0.0209) time: 0.0949 data: 0.0167 max mem: 5716\n","Epoch: [39856] Total time: 0:00:01 (0.0991 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0205 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39857] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0207 (0.0207) time: 0.2920 data: 0.2054 max mem: 5716\n","Epoch: [39857] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0208 (0.0228) time: 0.0950 data: 0.0173 max mem: 5716\n","Epoch: [39857] Total time: 0:00:01 (0.0991 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0208 
(0.0228) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39858] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0230 (0.0230) time: 0.2959 data: 0.2079 max mem: 5716\n","Epoch: [39858] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0225 (0.0231) time: 0.0963 data: 0.0175 max mem: 5716\n","Epoch: [39858] Total time: 0:00:01 (0.1005 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0225 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39859] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0224 (0.0224) time: 0.2839 data: 0.1940 max mem: 5716\n","Epoch: [39859] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0224 (0.0232) time: 0.0950 data: 0.0163 max mem: 5716\n","Epoch: [39859] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0224 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39860] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0239 (0.0239) time: 0.2908 data: 0.1997 max mem: 5716\n","Epoch: [39860] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0232 (0.0242) time: 0.0953 data: 0.0168 max mem: 5716\n","Epoch: [39860] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0232 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39861] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0204 (0.0204) time: 0.2899 data: 0.1975 max mem: 5716\n","Epoch: [39861] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0235 (0.0244) time: 0.0991 data: 0.0166 max mem: 5716\n","Epoch: 
[39861] Total time: 0:00:01 (0.1032 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0235 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39862] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0306 (0.0306) time: 0.2937 data: 0.2056 max mem: 5716\n","Epoch: [39862] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0241 (0.0246) time: 0.0953 data: 0.0173 max mem: 5716\n","Epoch: [39862] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0241 (0.0246) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39863] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0221 (0.0221) time: 0.2945 data: 0.2054 max mem: 5716\n","Epoch: [39863] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0199 (0.0208) time: 0.0954 data: 0.0173 max mem: 5716\n","Epoch: [39863] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0199 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39864] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0173 (0.0173) time: 0.2912 data: 0.1996 max mem: 5716\n","Epoch: [39864] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0211 (0.0210) time: 0.0960 data: 0.0168 max mem: 5716\n","Epoch: [39864] Total time: 0:00:01 (0.1002 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0211 (0.0210) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39865] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0188 (0.0188) time: 0.2855 data: 0.1970 max mem: 5716\n","Epoch: [39865] [11/12] eta: 0:00:00 
lr: 0.000000 loss: 0.0200 (0.0212) time: 0.0942 data: 0.0166 max mem: 5716\n","Epoch: [39865] Total time: 0:00:01 (0.0984 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0200 (0.0212) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39866] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0263 (0.0263) time: 0.3080 data: 0.2177 max mem: 5716\n","Epoch: [39866] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0232 (0.0233) time: 0.0962 data: 0.0183 max mem: 5716\n","Epoch: [39866] Total time: 0:00:01 (0.1034 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0232 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39867] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0213 (0.0213) time: 0.2885 data: 0.2034 max mem: 5716\n","Epoch: [39867] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0225 (0.0231) time: 0.0961 data: 0.0171 max mem: 5716\n","Epoch: [39867] Total time: 0:00:01 (0.1001 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0225 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39868] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0197 (0.0197) time: 0.2864 data: 0.1969 max mem: 5716\n","Epoch: [39868] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0228 (0.0234) time: 0.0952 data: 0.0166 max mem: 5716\n","Epoch: [39868] Total time: 0:00:01 (0.0993 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0228 (0.0234) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39869] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0170 
(0.0170) time: 0.2868 data: 0.1943 max mem: 5716\n","Epoch: [39869] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0238 (0.0243) time: 0.0986 data: 0.0163 max mem: 5716\n","Epoch: [39869] Total time: 0:00:01 (0.1036 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0238 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39870] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0275 (0.0275) time: 0.2971 data: 0.2026 max mem: 5716\n","Epoch: [39870] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0231 (0.0243) time: 0.0955 data: 0.0170 max mem: 5716\n","Epoch: [39870] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0231 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39871] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0207 (0.0207) time: 0.2801 data: 0.1909 max mem: 5716\n","Epoch: [39871] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0236 (0.0247) time: 0.0940 data: 0.0161 max mem: 5716\n","Epoch: [39871] Total time: 0:00:01 (0.0980 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0236 (0.0247) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39872] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0250 (0.0250) time: 0.2903 data: 0.1909 max mem: 5716\n","Epoch: [39872] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0217 (0.0208) time: 0.0950 data: 0.0160 max mem: 5716\n","Epoch: [39872] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0217 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39873] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0169 (0.0169) time: 0.2861 data: 0.1888 max mem: 5716\n","Epoch: [39873] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0199 (0.0208) time: 0.0941 data: 0.0159 max mem: 5716\n","Epoch: [39873] Total time: 0:00:01 (0.0994 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0199 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39874] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0185 (0.0185) time: 0.2919 data: 0.2028 max mem: 5716\n","Epoch: [39874] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0206 (0.0206) time: 0.0951 data: 0.0170 max mem: 5716\n","Epoch: [39874] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0206 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39875] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0203 (0.0203) time: 0.2903 data: 0.2037 max mem: 5716\n","Epoch: [39875] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0226 (0.0235) time: 0.0947 data: 0.0171 max mem: 5716\n","Epoch: [39875] Total time: 0:00:01 (0.0988 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0226 (0.0235) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39876] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0200 (0.0200) time: 0.2938 data: 0.2064 max mem: 5716\n","Epoch: [39876] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0221 (0.0228) time: 0.0951 data: 0.0173 max mem: 5716\n","Epoch: [39876] Total time: 0:00:01 (0.0991 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0221 
(0.0228) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39877] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0235 (0.0235) time: 0.2914 data: 0.2027 max mem: 5716\n","Epoch: [39877] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0223 (0.0231) time: 0.0980 data: 0.0170 max mem: 5716\n","Epoch: [39877] Total time: 0:00:01 (0.1021 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0223 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39878] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0250 (0.0250) time: 0.2920 data: 0.1980 max mem: 5716\n","Epoch: [39878] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0239 (0.0241) time: 0.0957 data: 0.0166 max mem: 5716\n","Epoch: [39878] Total time: 0:00:01 (0.0999 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0239 (0.0241) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39879] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0239 (0.0239) time: 0.2899 data: 0.2028 max mem: 5716\n","Epoch: [39879] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0239 (0.0242) time: 0.0954 data: 0.0170 max mem: 5716\n","Epoch: [39879] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0239 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39880] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0246 (0.0246) time: 0.2882 data: 0.1988 max mem: 5716\n","Epoch: [39880] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0244 (0.0241) time: 0.0947 data: 0.0167 max mem: 5716\n","Epoch: 
[39880] Total time: 0:00:01 (0.0989 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0244 (0.0241) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39881] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0226 (0.0226) time: 0.2844 data: 0.1987 max mem: 5716\n","Epoch: [39881] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0206 (0.0207) time: 0.0970 data: 0.0167 max mem: 5716\n","Epoch: [39881] Total time: 0:00:01 (0.1012 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0206 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39882] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0215 (0.0215) time: 0.2916 data: 0.2046 max mem: 5716\n","Epoch: [39882] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0209 (0.0208) time: 0.0948 data: 0.0172 max mem: 5716\n","Epoch: [39882] Total time: 0:00:01 (0.0989 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0209 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39883] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0192 (0.0192) time: 0.2896 data: 0.2004 max mem: 5716\n","Epoch: [39883] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0205 (0.0206) time: 0.0947 data: 0.0168 max mem: 5716\n","Epoch: [39883] Total time: 0:00:01 (0.0988 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0205 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39884] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0253 (0.0253) time: 0.3036 data: 0.2053 max mem: 5716\n","Epoch: [39884] [11/12] eta: 0:00:00 
lr: 0.000000 loss: 0.0238 (0.0233) time: 0.0965 data: 0.0173 max mem: 5716\n","Epoch: [39884] Total time: 0:00:01 (0.1006 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0238 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39885] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0240 (0.0240) time: 0.3003 data: 0.2125 max mem: 5716\n","Epoch: [39885] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0236 (0.0236) time: 0.0969 data: 0.0179 max mem: 5716\n","Epoch: [39885] Total time: 0:00:01 (0.1010 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0236 (0.0236) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39886] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0199 (0.0199) time: 0.2989 data: 0.2111 max mem: 5716\n","Epoch: [39886] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0219 (0.0230) time: 0.0959 data: 0.0178 max mem: 5716\n","Epoch: [39886] Total time: 0:00:01 (0.1000 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0219 (0.0230) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39887] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0229 (0.0229) time: 0.2821 data: 0.1928 max mem: 5716\n","Epoch: [39887] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0252 (0.0245) time: 0.0943 data: 0.0162 max mem: 5716\n","Epoch: [39887] Total time: 0:00:01 (0.0985 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0252 (0.0245) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39888] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0224 
(0.0224) time: 0.2961 data: 0.1993 max mem: 5716\n","Epoch: [39888] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0242 (0.0243) time: 0.0952 data: 0.0167 max mem: 5716\n","Epoch: [39888] Total time: 0:00:01 (0.1007 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0242 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39889] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0219 (0.0219) time: 0.2851 data: 0.1957 max mem: 5716\n","Epoch: [39889] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0246 (0.0245) time: 0.0949 data: 0.0165 max mem: 5716\n","Epoch: [39889] Total time: 0:00:01 (0.0989 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0246 (0.0245) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39890] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0160 (0.0160) time: 0.2920 data: 0.1973 max mem: 5716\n","Epoch: [39890] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0205 (0.0208) time: 0.0949 data: 0.0166 max mem: 5716\n","Epoch: [39890] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0205 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39891] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0159 (0.0159) time: 0.2904 data: 0.2033 max mem: 5716\n","Epoch: [39891] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0208 (0.0208) time: 0.0948 data: 0.0171 max mem: 5716\n","Epoch: [39891] Total time: 0:00:01 (0.0989 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0208 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39892] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0215 (0.0215) time: 0.2812 data: 0.1919 max mem: 5716\n","Epoch: [39892] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0208 (0.0208) time: 0.0939 data: 0.0161 max mem: 5716\n","Epoch: [39892] Total time: 0:00:01 (0.0979 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0208 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39893] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0219 (0.0219) time: 0.2849 data: 0.1980 max mem: 5716\n","Epoch: [39893] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0224 (0.0230) time: 0.0942 data: 0.0167 max mem: 5716\n","Epoch: [39893] Total time: 0:00:01 (0.0983 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0224 (0.0230) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39894] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0202 (0.0202) time: 0.2939 data: 0.2006 max mem: 5716\n","Epoch: [39894] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0225 (0.0233) time: 0.0969 data: 0.0169 max mem: 5716\n","Epoch: [39894] Total time: 0:00:01 (0.1022 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0225 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39895] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0292 (0.0292) time: 0.2840 data: 0.1918 max mem: 5716\n","Epoch: [39895] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0231 (0.0233) time: 0.0956 data: 0.0161 max mem: 5716\n","Epoch: [39895] Total time: 0:00:01 (0.0997 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0231 
(0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39896] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0261 (0.0261) time: 0.2865 data: 0.1966 max mem: 5716\n","Epoch: [39896] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0251 (0.0244) time: 0.0948 data: 0.0165 max mem: 5716\n","Epoch: [39896] Total time: 0:00:01 (0.0988 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0251 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39897] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0221 (0.0221) time: 0.2846 data: 0.1959 max mem: 5716\n","Epoch: [39897] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0241 (0.0241) time: 0.0946 data: 0.0165 max mem: 5716\n","Epoch: [39897] Total time: 0:00:01 (0.0987 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0241 (0.0241) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39898] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0242 (0.0242) time: 0.2873 data: 0.1970 max mem: 5716\n","Epoch: [39898] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0233 (0.0246) time: 0.0950 data: 0.0166 max mem: 5716\n","Epoch: [39898] Total time: 0:00:01 (0.0991 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0233 (0.0246) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39899] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0159 (0.0159) time: 0.2857 data: 0.1955 max mem: 5716\n","Epoch: [39899] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0203 (0.0208) time: 0.0950 data: 0.0164 max mem: 5716\n","Epoch: 
[39899] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0203 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39900] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0206 (0.0206) time: 0.2858 data: 0.1963 max mem: 5716\n","Epoch: [39900] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0200 (0.0207) time: 0.0956 data: 0.0165 max mem: 5716\n","Epoch: [39900] Total time: 0:00:01 (0.0997 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0200 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39901] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0212 (0.0212) time: 0.2903 data: 0.2038 max mem: 5716\n","Epoch: [39901] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0207 (0.0211) time: 0.0948 data: 0.0171 max mem: 5716\n","Epoch: [39901] Total time: 0:00:01 (0.0989 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0207 (0.0211) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39902] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0253 (0.0253) time: 0.2925 data: 0.2059 max mem: 5716\n","Epoch: [39902] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0228 (0.0236) time: 0.0950 data: 0.0173 max mem: 5716\n","Epoch: [39902] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0228 (0.0236) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39903] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0228 (0.0228) time: 0.2855 data: 0.1939 max mem: 5716\n","Epoch: [39903] [11/12] eta: 0:00:00 
lr: 0.000000 loss: 0.0238 (0.0234) time: 0.0952 data: 0.0163 max mem: 5716\n","Epoch: [39903] Total time: 0:00:01 (0.1008 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0238 (0.0234) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39904] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0245 (0.0245) time: 0.3069 data: 0.2161 max mem: 5716\n","Epoch: [39904] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0215 (0.0227) time: 0.1001 data: 0.0182 max mem: 5716\n","Epoch: [39904] Total time: 0:00:01 (0.1042 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0215 (0.0227) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39905] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0275 (0.0275) time: 0.2878 data: 0.1976 max mem: 5716\n","Epoch: [39905] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0232 (0.0238) time: 0.0958 data: 0.0166 max mem: 5716\n","Epoch: [39905] Total time: 0:00:01 (0.0999 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0232 (0.0238) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39906] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0192 (0.0192) time: 0.2968 data: 0.2001 max mem: 5716\n","Epoch: [39906] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0238 (0.0243) time: 0.0955 data: 0.0168 max mem: 5716\n","Epoch: [39906] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0238 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39907] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0199 
(0.0199) time: 0.2914 data: 0.2055 max mem: 5716\n","Epoch: [39907] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0248 (0.0246) time: 0.0948 data: 0.0173 max mem: 5716\n","Epoch: [39907] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0248 (0.0246) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39908] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0173 (0.0173) time: 0.2855 data: 0.1958 max mem: 5716\n","Epoch: [39908] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0207 (0.0208) time: 0.0956 data: 0.0165 max mem: 5716\n","Epoch: [39908] Total time: 0:00:01 (0.0997 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0207 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39909] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0191 (0.0191) time: 0.2947 data: 0.2070 max mem: 5716\n","Epoch: [39909] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0193 (0.0207) time: 0.0954 data: 0.0174 max mem: 5716\n","Epoch: [39909] Total time: 0:00:01 (0.0994 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0193 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39910] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0285 (0.0285) time: 0.2918 data: 0.2030 max mem: 5716\n","Epoch: [39910] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0202 (0.0208) time: 0.0947 data: 0.0170 max mem: 5716\n","Epoch: [39910] Total time: 0:00:01 (0.0988 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0202 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39911] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0208 (0.0208) time: 0.2896 data: 0.1965 max mem: 5716\n","Epoch: [39911] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0224 (0.0232) time: 0.0985 data: 0.0165 max mem: 5716\n","Epoch: [39911] Total time: 0:00:01 (0.1027 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0224 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39912] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0301 (0.0301) time: 0.2783 data: 0.1896 max mem: 5716\n","Epoch: [39912] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0223 (0.0232) time: 0.0943 data: 0.0160 max mem: 5716\n","Epoch: [39912] Total time: 0:00:01 (0.0985 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0223 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39913] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0210 (0.0210) time: 0.2927 data: 0.1945 max mem: 5716\n","Epoch: [39913] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0228 (0.0232) time: 0.0949 data: 0.0164 max mem: 5716\n","Epoch: [39913] Total time: 0:00:01 (0.0991 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0228 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39914] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0237 (0.0237) time: 0.2910 data: 0.1938 max mem: 5716\n","Epoch: [39914] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0239 (0.0243) time: 0.0954 data: 0.0163 max mem: 5716\n","Epoch: [39914] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0239 
(0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39915] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0265 (0.0265) time: 0.2838 data: 0.1921 max mem: 5716\n","Epoch: [39915] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0240 (0.0241) time: 0.0950 data: 0.0162 max mem: 5716\n","Epoch: [39915] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0240 (0.0241) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39916] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0249 (0.0249) time: 0.2802 data: 0.1892 max mem: 5716\n","Epoch: [39916] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0242 (0.0247) time: 0.0941 data: 0.0159 max mem: 5716\n","Epoch: [39916] Total time: 0:00:01 (0.0982 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0242 (0.0247) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39917] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0240 (0.0240) time: 0.2819 data: 0.1920 max mem: 5716\n","Epoch: [39917] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0202 (0.0206) time: 0.0943 data: 0.0161 max mem: 5716\n","Epoch: [39917] Total time: 0:00:01 (0.0983 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0202 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39918] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0181 (0.0181) time: 0.2849 data: 0.1963 max mem: 5716\n","Epoch: [39918] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0205 (0.0207) time: 0.0947 data: 0.0165 max mem: 5716\n","Epoch: 
[39918] Total time: 0:00:01 (0.0988 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0205 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39919] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0199 (0.0199) time: 0.2943 data: 0.2062 max mem: 5716\n","Epoch: [39919] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0196 (0.0209) time: 0.0958 data: 0.0174 max mem: 5716\n","Epoch: [39919] Total time: 0:00:01 (0.0999 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0196 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39920] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0219 (0.0219) time: 0.2956 data: 0.2021 max mem: 5716\n","Epoch: [39920] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0220 (0.0229) time: 0.0960 data: 0.0170 max mem: 5716\n","Epoch: [39920] Total time: 0:00:01 (0.1002 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0220 (0.0229) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39921] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0233 (0.0233) time: 0.2869 data: 0.1975 max mem: 5716\n","Epoch: [39921] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0233 (0.0237) time: 0.0961 data: 0.0166 max mem: 5716\n","Epoch: [39921] Total time: 0:00:01 (0.1002 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0233 (0.0237) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39922] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0205 (0.0205) time: 0.2975 data: 0.2094 max mem: 5716\n","Epoch: [39922] [11/12] eta: 0:00:00 
lr: 0.000000 loss: 0.0226 (0.0232) time: 0.0956 data: 0.0176 max mem: 5716\n","Epoch: [39922] Total time: 0:00:01 (0.0998 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0226 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39923] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0310 (0.0310) time: 0.2917 data: 0.2004 max mem: 5716\n","Epoch: [39923] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0240 (0.0245) time: 0.0959 data: 0.0169 max mem: 5716\n","Epoch: [39923] Total time: 0:00:01 (0.1000 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0240 (0.0245) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39924] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0221 (0.0221) time: 0.2885 data: 0.1977 max mem: 5716\n","Epoch: [39924] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0245 (0.0245) time: 0.0951 data: 0.0166 max mem: 5716\n","Epoch: [39924] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0245 (0.0245) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39925] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0200 (0.0200) time: 0.2845 data: 0.1944 max mem: 5716\n","Epoch: [39925] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0229 (0.0242) time: 0.0948 data: 0.0164 max mem: 5716\n","Epoch: [39925] Total time: 0:00:01 (0.0989 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0229 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39926] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0186 
(0.0186) time: 0.2945 data: 0.2040 max mem: 5716\n","Epoch: [39926] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0201 (0.0208) time: 0.0959 data: 0.0172 max mem: 5716\n","Epoch: [39926] Total time: 0:00:01 (0.1000 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0201 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39927] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0281 (0.0281) time: 0.2994 data: 0.2107 max mem: 5716\n","Epoch: [39927] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0200 (0.0208) time: 0.0958 data: 0.0177 max mem: 5716\n","Epoch: [39927] Total time: 0:00:01 (0.0999 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0200 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39928] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0241 (0.0241) time: 0.3006 data: 0.2132 max mem: 5716\n","Epoch: [39928] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0192 (0.0207) time: 0.0962 data: 0.0179 max mem: 5716\n","Epoch: [39928] Total time: 0:00:01 (0.1004 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0192 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39929] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0233 (0.0233) time: 0.2871 data: 0.1972 max mem: 5716\n","Epoch: [39929] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0230 (0.0232) time: 0.0949 data: 0.0166 max mem: 5716\n","Epoch: [39929] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0230 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39930] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0245 (0.0245) time: 0.2902 data: 0.2019 max mem: 5716\n","Epoch: [39930] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0235 (0.0232) time: 0.0963 data: 0.0170 max mem: 5716\n","Epoch: [39930] Total time: 0:00:01 (0.1015 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0235 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39931] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0245 (0.0245) time: 0.2850 data: 0.1957 max mem: 5716\n","Epoch: [39931] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0228 (0.0229) time: 0.0946 data: 0.0165 max mem: 5716\n","Epoch: [39931] Total time: 0:00:01 (0.1000 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0228 (0.0229) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39932] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0278 (0.0278) time: 0.2972 data: 0.2096 max mem: 5716\n","Epoch: [39932] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0243 (0.0242) time: 0.0956 data: 0.0176 max mem: 5716\n","Epoch: [39932] Total time: 0:00:01 (0.0997 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0243 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39933] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0229 (0.0229) time: 0.2820 data: 0.1927 max mem: 5716\n","Epoch: [39933] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0240 (0.0244) time: 0.0942 data: 0.0162 max mem: 5716\n","Epoch: [39933] Total time: 0:00:01 (0.0983 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0240 
(0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39934] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0244 (0.0244) time: 0.2879 data: 0.1984 max mem: 5716\n","Epoch: [39934] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0227 (0.0237) time: 0.0947 data: 0.0167 max mem: 5716\n","Epoch: [39934] Total time: 0:00:01 (0.0988 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0227 (0.0237) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39935] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0225 (0.0225) time: 0.2936 data: 0.2049 max mem: 5716\n","Epoch: [39935] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0207 (0.0209) time: 0.0957 data: 0.0172 max mem: 5716\n","Epoch: [39935] Total time: 0:00:01 (0.0997 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0207 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39936] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0219 (0.0219) time: 0.2897 data: 0.1987 max mem: 5716\n","Epoch: [39936] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0202 (0.0208) time: 0.0956 data: 0.0167 max mem: 5716\n","Epoch: [39936] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0202 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39937] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0231 (0.0231) time: 0.2914 data: 0.2031 max mem: 5716\n","Epoch: [39937] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0208 (0.0208) time: 0.0954 data: 0.0171 max mem: 5716\n","Epoch: 
[39937] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0208 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39938] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0229 (0.0229) time: 0.2989 data: 0.1956 max mem: 5716\n","Epoch: [39938] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0229 (0.0232) time: 0.0959 data: 0.0165 max mem: 5716\n","Epoch: [39938] Total time: 0:00:01 (0.1000 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0229 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39939] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0256 (0.0256) time: 0.2887 data: 0.1949 max mem: 5716\n","Epoch: [39939] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0226 (0.0232) time: 0.0978 data: 0.0164 max mem: 5716\n","Epoch: [39939] Total time: 0:00:01 (0.1020 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0226 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39940] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0284 (0.0284) time: 0.2880 data: 0.1989 max mem: 5716\n","Epoch: [39940] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0219 (0.0232) time: 0.0946 data: 0.0167 max mem: 5716\n","Epoch: [39940] Total time: 0:00:01 (0.0988 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0219 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39941] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0174 (0.0174) time: 0.2859 data: 0.1969 max mem: 5716\n","Epoch: [39941] [11/12] eta: 0:00:00 
lr: 0.000000 loss: 0.0237 (0.0240) time: 0.0951 data: 0.0166 max mem: 5716\n","Epoch: [39941] Total time: 0:00:01 (0.0993 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0237 (0.0240) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39942] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0194 (0.0194) time: 0.2897 data: 0.2033 max mem: 5716\n","Epoch: [39942] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0229 (0.0241) time: 0.0957 data: 0.0171 max mem: 5716\n","Epoch: [39942] Total time: 0:00:01 (0.0998 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0229 (0.0241) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39943] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0289 (0.0289) time: 0.2835 data: 0.1940 max mem: 5716\n","Epoch: [39943] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0248 (0.0244) time: 0.0974 data: 0.0163 max mem: 5716\n","Epoch: [39943] Total time: 0:00:01 (0.1016 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0248 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39944] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0221 (0.0221) time: 0.2894 data: 0.2006 max mem: 5716\n","Epoch: [39944] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0208 (0.0206) time: 0.0951 data: 0.0169 max mem: 5716\n","Epoch: [39944] Total time: 0:00:01 (0.0993 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0208 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39945] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0158 
(0.0158) time: 0.2924 data: 0.1985 max mem: 5716\n","Epoch: [39945] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0209 (0.0206) time: 0.0958 data: 0.0167 max mem: 5716\n","Epoch: [39945] Total time: 0:00:01 (0.1000 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0209 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39946] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0274 (0.0274) time: 0.2913 data: 0.2048 max mem: 5716\n","Epoch: [39946] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0197 (0.0207) time: 0.0955 data: 0.0172 max mem: 5716\n","Epoch: [39946] Total time: 0:00:01 (0.0996 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0197 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39947] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0233 (0.0233) time: 0.2927 data: 0.2011 max mem: 5716\n","Epoch: [39947] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0233 (0.0233) time: 0.0969 data: 0.0169 max mem: 5716\n","Epoch: [39947] Total time: 0:00:01 (0.1011 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0233 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39948] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0241 (0.0241) time: 0.2860 data: 0.1948 max mem: 5716\n","Epoch: [39948] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0232 (0.0231) time: 0.0943 data: 0.0164 max mem: 5716\n","Epoch: [39948] Total time: 0:00:01 (0.0984 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0232 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39949] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0228 (0.0228) time: 0.2854 data: 0.1949 max mem: 5716\n","Epoch: [39949] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0225 (0.0232) time: 0.0944 data: 0.0164 max mem: 5716\n","Epoch: [39949] Total time: 0:00:01 (0.0985 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0225 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39950] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0236 (0.0236) time: 0.2852 data: 0.1960 max mem: 5716\n","Epoch: [39950] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0246 (0.0244) time: 0.0948 data: 0.0165 max mem: 5716\n","Epoch: [39950] Total time: 0:00:01 (0.0999 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0246 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39951] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0216 (0.0216) time: 0.2973 data: 0.1968 max mem: 5716\n","Epoch: [39951] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0241 (0.0246) time: 0.0957 data: 0.0166 max mem: 5716\n","Epoch: [39951] Total time: 0:00:01 (0.0998 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0241 (0.0246) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39952] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0257 (0.0257) time: 0.2933 data: 0.2065 max mem: 5716\n","Epoch: [39952] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0242 (0.0243) time: 0.0950 data: 0.0173 max mem: 5716\n","Epoch: [39952] Total time: 0:00:01 (0.0992 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0242 
(0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39953] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0151 (0.0151) time: 0.2861 data: 0.1968 max mem: 5716\n","Epoch: [39953] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0201 (0.0207) time: 0.0950 data: 0.0165 max mem: 5716\n","Epoch: [39953] Total time: 0:00:01 (0.0991 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0201 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39954] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0177 (0.0177) time: 0.2907 data: 0.1996 max mem: 5716\n","Epoch: [39954] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0200 (0.0210) time: 0.0952 data: 0.0168 max mem: 5716\n","Epoch: [39954] Total time: 0:00:01 (0.0993 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0200 (0.0210) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39955] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0174 (0.0174) time: 0.2848 data: 0.1953 max mem: 5716\n","Epoch: [39955] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0198 (0.0204) time: 0.0944 data: 0.0164 max mem: 5716\n","Epoch: [39955] Total time: 0:00:01 (0.0985 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0198 (0.0204) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39956] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0263 (0.0263) time: 0.2876 data: 0.2002 max mem: 5716\n","Epoch: [39956] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0221 (0.0232) time: 0.0946 data: 0.0168 max mem: 5716\n","Epoch: 
[39956] Total time: 0:00:01 (0.0987 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0221 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39957] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0261 (0.0261) time: 0.2889 data: 0.1984 max mem: 5716\n","Epoch: [39957] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0230 (0.0237) time: 0.0955 data: 0.0167 max mem: 5716\n","Epoch: [39957] Total time: 0:00:01 (0.0997 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0230 (0.0237) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39958] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0267 (0.0267) time: 0.2839 data: 0.1946 max mem: 5716\n","Epoch: [39958] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0226 (0.0230) time: 0.0947 data: 0.0164 max mem: 5716\n","Epoch: [39958] Total time: 0:00:01 (0.0989 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0226 (0.0230) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39959] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0224 (0.0224) time: 0.2860 data: 0.1967 max mem: 5716\n","Epoch: [39959] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0224 (0.0243) time: 0.0945 data: 0.0165 max mem: 5716\n","Epoch: [39959] Total time: 0:00:01 (0.0986 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0224 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39960] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0271 (0.0271) time: 0.2851 data: 0.1935 max mem: 5716\n","Epoch: [39960] [11/12] eta: 0:00:00 
lr: 0.000000 loss: 0.0237 (0.0240) time: 0.0948 data: 0.0163 max mem: 5716\n","Epoch: [39960] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0237 (0.0240) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39961] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0268 (0.0268) time: 0.2989 data: 0.2105 max mem: 5716\n","Epoch: [39961] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0239 (0.0245) time: 0.0967 data: 0.0177 max mem: 5716\n","Epoch: [39961] Total time: 0:00:01 (0.1022 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0239 (0.0245) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39962] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0193 (0.0193) time: 0.2939 data: 0.2054 max mem: 5716\n","Epoch: [39962] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0195 (0.0206) time: 0.0958 data: 0.0173 max mem: 5716\n","Epoch: [39962] Total time: 0:00:01 (0.1000 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0195 (0.0206) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39963] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0179 (0.0179) time: 0.2927 data: 0.2042 max mem: 5716\n","Epoch: [39963] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0186 (0.0209) time: 0.0953 data: 0.0172 max mem: 5716\n","Epoch: [39963] Total time: 0:00:01 (0.0994 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0186 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39964] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0207 
(0.0207) time: 0.3069 data: 0.2036 max mem: 5716\n","Epoch: [39964] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0204 (0.0209) time: 0.0969 data: 0.0171 max mem: 5716\n","Epoch: [39964] Total time: 0:00:01 (0.1011 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0204 (0.0209) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39965] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0222 (0.0222) time: 0.2856 data: 0.1969 max mem: 5716\n","Epoch: [39965] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0225 (0.0228) time: 0.0941 data: 0.0165 max mem: 5716\n","Epoch: [39965] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0225 (0.0228) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39966] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0210 (0.0210) time: 0.2983 data: 0.1982 max mem: 5716\n","Epoch: [39966] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0225 (0.0232) time: 0.0965 data: 0.0166 max mem: 5716\n","Epoch: [39966] Total time: 0:00:01 (0.1008 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0225 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39967] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0243 (0.0243) time: 0.2984 data: 0.2049 max mem: 5716\n","Epoch: [39967] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0226 (0.0233) time: 0.0952 data: 0.0172 max mem: 5716\n","Epoch: [39967] Total time: 0:00:01 (0.0993 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0226 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39968] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0262 (0.0262) time: 0.2845 data: 0.1961 max mem: 5716\n","Epoch: [39968] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0238 (0.0245) time: 0.0941 data: 0.0165 max mem: 5716\n","Epoch: [39968] Total time: 0:00:01 (0.0984 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0238 (0.0245) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39969] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0209 (0.0209) time: 0.2846 data: 0.1950 max mem: 5716\n","Epoch: [39969] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0240 (0.0244) time: 0.0943 data: 0.0164 max mem: 5716\n","Epoch: [39969] Total time: 0:00:01 (0.0985 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0240 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39970] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0211 (0.0211) time: 0.2852 data: 0.1950 max mem: 5716\n","Epoch: [39970] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0233 (0.0244) time: 0.0950 data: 0.0164 max mem: 5716\n","Epoch: [39970] Total time: 0:00:01 (0.0991 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0233 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39971] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0208 (0.0208) time: 0.3030 data: 0.2058 max mem: 5716\n","Epoch: [39971] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0198 (0.0207) time: 0.0977 data: 0.0173 max mem: 5716\n","Epoch: [39971] Total time: 0:00:01 (0.1019 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0198 
(0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39972] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0205 (0.0205) time: 0.2925 data: 0.1988 max mem: 5716\n","Epoch: [39972] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0205 (0.0207) time: 0.0981 data: 0.0167 max mem: 5716\n","Epoch: [39972] Total time: 0:00:01 (0.1023 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0205 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39973] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0167 (0.0167) time: 0.2938 data: 0.2049 max mem: 5716\n","Epoch: [39973] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0207 (0.0207) time: 0.0987 data: 0.0172 max mem: 5716\n","Epoch: [39973] Total time: 0:00:01 (0.1028 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0207 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39974] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0226 (0.0226) time: 0.2919 data: 0.2046 max mem: 5716\n","Epoch: [39974] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0232 (0.0232) time: 0.0948 data: 0.0172 max mem: 5716\n","Epoch: [39974] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0232 (0.0232) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39975] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0218 (0.0218) time: 0.2844 data: 0.1959 max mem: 5716\n","Epoch: [39975] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0224 (0.0235) time: 0.0943 data: 0.0165 max mem: 5716\n","Epoch: 
[39975] Total time: 0:00:01 (0.0985 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0224 (0.0235) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39976] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0216 (0.0216) time: 0.2888 data: 0.2026 max mem: 5716\n","Epoch: [39976] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0220 (0.0233) time: 0.0945 data: 0.0170 max mem: 5716\n","Epoch: [39976] Total time: 0:00:01 (0.0986 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0220 (0.0233) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39977] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0247 (0.0247) time: 0.2842 data: 0.1956 max mem: 5716\n","Epoch: [39977] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0244 (0.0242) time: 0.0944 data: 0.0164 max mem: 5716\n","Epoch: [39977] Total time: 0:00:01 (0.0985 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0244 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39978] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0191 (0.0191) time: 0.2922 data: 0.2048 max mem: 5716\n","Epoch: [39978] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0246 (0.0243) time: 0.0952 data: 0.0172 max mem: 5716\n","Epoch: [39978] Total time: 0:00:01 (0.0993 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0246 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39979] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0286 (0.0286) time: 0.2807 data: 0.1871 max mem: 5716\n","Epoch: [39979] [11/12] eta: 0:00:00 
lr: 0.000000 loss: 0.0236 (0.0245) time: 0.0946 data: 0.0158 max mem: 5716\n","Epoch: [39979] Total time: 0:00:01 (0.0987 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0236 (0.0245) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39980] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0223 (0.0223) time: 0.2803 data: 0.1911 max mem: 5716\n","Epoch: [39980] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0188 (0.0205) time: 0.0941 data: 0.0161 max mem: 5716\n","Epoch: [39980] Total time: 0:00:01 (0.0982 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0188 (0.0205) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39981] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0223 (0.0223) time: 0.2977 data: 0.2065 max mem: 5716\n","Epoch: [39981] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0204 (0.0208) time: 0.0962 data: 0.0174 max mem: 5716\n","Epoch: [39981] Total time: 0:00:01 (0.1004 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0204 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39982] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0197 (0.0197) time: 0.2786 data: 0.1891 max mem: 5716\n","Epoch: [39982] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0204 (0.0208) time: 0.0941 data: 0.0159 max mem: 5716\n","Epoch: [39982] Total time: 0:00:01 (0.0982 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0204 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39983] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0235 
(0.0235) time: 0.2895 data: 0.2017 max mem: 5716\n","Epoch: [39983] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0229 (0.0230) time: 0.0953 data: 0.0170 max mem: 5716\n","Epoch: [39983] Total time: 0:00:01 (0.0994 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0229 (0.0230) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39984] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0242 (0.0242) time: 0.2954 data: 0.2106 max mem: 5716\n","Epoch: [39984] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0234 (0.0231) time: 0.0955 data: 0.0177 max mem: 5716\n","Epoch: [39984] Total time: 0:00:01 (0.0995 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0234 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39985] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0250 (0.0250) time: 0.2868 data: 0.2009 max mem: 5716\n","Epoch: [39985] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0231 (0.0234) time: 0.0946 data: 0.0169 max mem: 5716\n","Epoch: [39985] Total time: 0:00:01 (0.0987 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0231 (0.0234) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39986] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0230 (0.0230) time: 0.2831 data: 0.1945 max mem: 5716\n","Epoch: [39986] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0230 (0.0244) time: 0.0940 data: 0.0164 max mem: 5716\n","Epoch: [39986] Total time: 0:00:01 (0.0982 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0230 (0.0244) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: 
/home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39987] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0302 (0.0302) time: 0.2867 data: 0.1967 max mem: 5716\n","Epoch: [39987] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0234 (0.0243) time: 0.0948 data: 0.0165 max mem: 5716\n","Epoch: [39987] Total time: 0:00:01 (0.0991 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0234 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39988] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0286 (0.0286) time: 0.2916 data: 0.2012 max mem: 5716\n","Epoch: [39988] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0238 (0.0243) time: 0.0952 data: 0.0169 max mem: 5716\n","Epoch: [39988] Total time: 0:00:01 (0.0994 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0238 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39989] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0231 (0.0231) time: 0.2851 data: 0.1950 max mem: 5716\n","Epoch: [39989] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0200 (0.0207) time: 0.0948 data: 0.0164 max mem: 5716\n","Epoch: [39989] Total time: 0:00:01 (0.0990 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0200 (0.0207) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39990] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0203 (0.0203) time: 0.3000 data: 0.2131 max mem: 5716\n","Epoch: [39990] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0204 (0.0208) time: 0.0961 data: 0.0179 max mem: 5716\n","Epoch: [39990] Total time: 0:00:01 (0.1002 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0204 
(0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39991] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0246 (0.0246) time: 0.2931 data: 0.2050 max mem: 5716\n","Epoch: [39991] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0210 (0.0208) time: 0.0978 data: 0.0172 max mem: 5716\n","Epoch: [39991] Total time: 0:00:01 (0.1020 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0210 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39992] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0255 (0.0255) time: 0.2835 data: 0.1954 max mem: 5716\n","Epoch: [39992] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0237 (0.0231) time: 0.0939 data: 0.0164 max mem: 5716\n","Epoch: [39992] Total time: 0:00:01 (0.0981 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0237 (0.0231) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39993] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0243 (0.0243) time: 0.2853 data: 0.1943 max mem: 5716\n","Epoch: [39993] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0235 (0.0234) time: 0.0943 data: 0.0164 max mem: 5716\n","Epoch: [39993] Total time: 0:00:01 (0.0984 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0235 (0.0234) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39994] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0220 (0.0220) time: 0.2834 data: 0.1945 max mem: 5716\n","Epoch: [39994] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0220 (0.0226) time: 0.0939 data: 0.0164 max mem: 5716\n","Epoch: 
[39994] Total time: 0:00:01 (0.0980 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0220 (0.0226) fix_position_ratio: 0.25 puzzle_patch_size: 32\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39995] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0207 (0.0207) time: 0.2875 data: 0.1968 max mem: 5716\n","Epoch: [39995] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0230 (0.0243) time: 0.0964 data: 0.0165 max mem: 5716\n","Epoch: [39995] Total time: 0:00:01 (0.1005 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0230 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39996] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0250 (0.0250) time: 0.2857 data: 0.1941 max mem: 5716\n","Epoch: [39996] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0249 (0.0242) time: 0.0955 data: 0.0163 max mem: 5716\n","Epoch: [39996] Total time: 0:00:01 (0.1005 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0249 (0.0242) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39997] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0232 (0.0232) time: 0.2845 data: 0.1943 max mem: 5716\n","Epoch: [39997] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0241 (0.0243) time: 0.0958 data: 0.0163 max mem: 5716\n","Epoch: [39997] Total time: 0:00:01 (0.0999 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0241 (0.0243) fix_position_ratio: 0.25 puzzle_patch_size: 112\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39998] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0228 (0.0228) time: 0.2939 data: 0.1968 max mem: 5716\n","Epoch: [39998] [11/12] eta: 
0:00:00 lr: 0.000000 loss: 0.0213 (0.0208) time: 0.0974 data: 0.0165 max mem: 5716\n","Epoch: [39998] Total time: 0:00:01 (0.1034 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0213 (0.0208) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","log_dir: /home/Pathology_Experiment/imaging_results/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20\n","Epoch: [39999] [ 0/12] eta: 0:00:03 lr: 0.000000 loss: 0.0227 (0.0227) time: 0.3011 data: 0.2095 max mem: 5716\n","Epoch: [39999] [11/12] eta: 0:00:00 lr: 0.000000 loss: 0.0222 (0.0211) time: 0.0963 data: 0.0176 max mem: 5716\n","Epoch: [39999] Total time: 0:00:01 (0.1004 s / it)\n","Averaged stats: lr: 0.000000 loss: 0.0222 (0.0211) fix_position_ratio: 0.25 puzzle_patch_size: 16\n","Figure(640x480)\n","Training time 17:43:58\n"]}],"source":["!python PuzzleTuning.py --model sae_vit_base_patch16 --PromptTuning Deep --batch_size 32 --group_shuffle_size 8 --strategy loop --blr 1.5e-5 --epochs 40000 --warmup_epochs 20 --accum_iter 2 --print_freq 200 --check_point_gap 10000 --input_size 224 --pin_mem --num_workers 2 --basic_state_dict timm --data_path /data/Pathology_Experiment/dataset/PuzzleTuning_demoset --output_dir /home/Pathology_Experiment/runs --log_dir /home/Pathology_Experiment/imaging_results"]},{"cell_type":"markdown","metadata":{"id":"toMbJKBKa7Sw"},"source":["Visulization"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"background_save":true},"id":"v4jPIn7DbAaX"},"outputs":[{"name":"stdout","output_type":"stream","text":["\n","\n","Testing_PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20_b_8_hint_ratio_0.5_patch_size_16\n","\n","\n","Use 1 GPUs of idx: 0\n","job dir: 
/home/Pathology_Experiment/code\n","Namespace(model_idx='Testing_PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20_b_8_hint_ratio_0.5_patch_size_16',\n","batch_size=8,\n","model='sae_vit_base_patch16',\n","seg_decoder=None,\n","input_size=224,\n","num_classes=3,\n","mask_ratio=None,\n","fix_position_ratio=0.5,\n","fix_patch_size=16,\n","group_shuffle_size=-1,\n","shuffle_dataloader=False,\n","PromptTuning='Deep',\n","Prompt_Token_num=20,\n","norm_pix_loss=False,\n","data_path='/data/Pathology_Experiment/dataset/PuzzleTuning_demoset',\n","output_dir='/home/Pathology_Experiment/imaging_results',\n","log_dir='/home/Pathology_Experiment/imaging_results',\n","gpu_idx=0,\n","device='cuda',\n","seed=42,\n","checkpoint_path='/home/Pathology_Experiment/runs/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20_checkpoint-39999.pth',\n","combined_pred_illustration=False,\n","enable_visualize_check=True,\n","check_minibatch=None,\n","check_samples=1,\n","num_workers=10,\n","pin_mem=True,\n","gpu=1)\n","dataset_test Dataset ImageFolder\n"," Number of datapoints: 400\n"," Root location: /data/Pathology_Experiment/dataset/PuzzleTuning_demoset\n"," StandardTransform\n","Transform: Compose(\n"," Resize(size=224, interpolation=bilinear, max_size=None, antialias=warn)\n"," ToTensor()\n"," )\n","Testing output files will be at /home/Pathology_Experiment/imaging_results/Testing_PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20_b_8_hint_ratio_0.5_patch_size_16\n","Decoder: None\n","Start testing for Testing_PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20_b_8_hint_ratio_0.5_patch_size_16 \n"," with checkpoint: /home/Pathology_Experiment/runs/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20_checkpoint-39999.pth\n","Test index 1 of 1 minibatch with batch_size of 8 time used: 2.576171636581421\n","minibatch AVG loss: 
0.026270559057593346\n","Figure(640x480)\n","Test index 2 of 1 minibatch with batch_size of 8 time used: 0.6659231185913086\n","minibatch AVG loss: 0.024316485971212387\n","Figure(640x480)\n","Test index 3 of 1 minibatch with batch_size of 8 time used: 0.34589219093322754\n","minibatch AVG loss: 0.029779721051454544\n","Figure(640x480)\n","Test index 4 of 1 minibatch with batch_size of 8 time used: 0.3440580368041992\n","minibatch AVG loss: 0.034707095474004745\n","Figure(640x480)\n","Test index 5 of 1 minibatch with batch_size of 8 time used: 0.3439669609069824\n","minibatch AVG loss: 0.029156949371099472\n","Figure(640x480)\n","Test index 6 of 1 minibatch with batch_size of 8 time used: 0.3427398204803467\n","minibatch AVG loss: 0.02971576154232025\n","Figure(640x480)\n","Test index 7 of 1 minibatch with batch_size of 8 time used: 0.5957798957824707\n","minibatch AVG loss: 0.0266494769603014\n","Figure(640x480)\n","Test index 8 of 1 minibatch with batch_size of 8 time used: 0.34395861625671387\n","minibatch AVG loss: 0.034395135939121246\n","Figure(640x480)\n","Test index 9 of 1 minibatch with batch_size of 8 time used: 0.3422093391418457\n","minibatch AVG loss: 0.01643659919500351\n","Figure(640x480)\n","Test index 10 of 1 minibatch with batch_size of 8 time used: 0.34267401695251465\n","minibatch AVG loss: 0.027027731761336327\n","Figure(640x480)\n","Test index 11 of 1 minibatch with batch_size of 8 time used: 0.34317874908447266\n","minibatch AVG loss: 0.03650519251823425\n","Figure(640x480)\n","Test index 12 of 1 minibatch with batch_size of 8 time used: 0.34625768661499023\n","minibatch AVG loss: 0.023865383118391037\n","Figure(640x480)\n","Test index 13 of 1 minibatch with batch_size of 8 time used: 0.34453797340393066\n","minibatch AVG loss: 0.027605310082435608\n","Figure(640x480)\n","Test index 14 of 1 minibatch with batch_size of 8 time used: 0.35005617141723633\n","minibatch AVG loss: 0.05249357223510742\n","Figure(640x480)\n","Test index 15 of 1 
minibatch with batch_size of 8 time used: 0.34470367431640625\n","minibatch AVG loss: 0.06539571285247803\n","Figure(640x480)\n","Test index 16 of 1 minibatch with batch_size of 8 time used: 0.34235501289367676\n","minibatch AVG loss: 0.0607786625623703\n","Figure(640x480)\n","Test index 17 of 1 minibatch with batch_size of 8 time used: 0.5056579113006592\n","minibatch AVG loss: 0.0522107295691967\n","Figure(640x480)\n","Test index 18 of 1 minibatch with batch_size of 8 time used: 0.3416411876678467\n","minibatch AVG loss: 0.04164537042379379\n","Figure(640x480)\n","Test index 19 of 1 minibatch with batch_size of 8 time used: 0.341935396194458\n","minibatch AVG loss: 0.05334985628724098\n","Figure(640x480)\n","Test index 20 of 1 minibatch with batch_size of 8 time used: 0.34108638763427734\n","minibatch AVG loss: 0.04166930913925171\n","Figure(640x480)\n","Test index 21 of 1 minibatch with batch_size of 8 time used: 0.34204626083374023\n","minibatch AVG loss: 0.0406920425593853\n","Figure(640x480)\n","Test index 22 of 1 minibatch with batch_size of 8 time used: 0.3456096649169922\n","minibatch AVG loss: 0.04499432444572449\n","Figure(640x480)\n","Test index 23 of 1 minibatch with batch_size of 8 time used: 0.3513031005859375\n","minibatch AVG loss: 0.049177490174770355\n","Figure(640x480)\n","Test index 24 of 1 minibatch with batch_size of 8 time used: 0.34375739097595215\n","minibatch AVG loss: 0.0517561100423336\n","Figure(640x480)\n","Test index 25 of 1 minibatch with batch_size of 8 time used: 0.35756635665893555\n","minibatch AVG loss: 0.053525157272815704\n","Figure(640x480)\n","Test index 26 of 1 minibatch with batch_size of 8 time used: 0.3438384532928467\n","minibatch AVG loss: 0.016326481476426125\n","Figure(640x480)\n","Test index 27 of 1 minibatch with batch_size of 8 time used: 0.5202908515930176\n","minibatch AVG loss: 0.023099487647414207\n","Figure(640x480)\n","Test index 28 of 1 minibatch with batch_size of 8 time used: 
0.3466987609863281\n","minibatch AVG loss: 0.020996764302253723\n","Figure(640x480)\n","Test index 29 of 1 minibatch with batch_size of 8 time used: 0.3557114601135254\n","minibatch AVG loss: 0.02975727617740631\n","Figure(640x480)\n","Test index 30 of 1 minibatch with batch_size of 8 time used: 0.35158228874206543\n","minibatch AVG loss: 0.022020038217306137\n","Figure(640x480)\n","Test index 31 of 1 minibatch with batch_size of 8 time used: 0.3453495502471924\n","minibatch AVG loss: 0.02332591824233532\n","Figure(640x480)\n","Test index 32 of 1 minibatch with batch_size of 8 time used: 0.353748083114624\n","minibatch AVG loss: 0.022864310070872307\n","Figure(640x480)\n","Test index 33 of 1 minibatch with batch_size of 8 time used: 0.3469088077545166\n","minibatch AVG loss: 0.027498576790094376\n","Figure(640x480)\n","Test index 34 of 1 minibatch with batch_size of 8 time used: 0.3442854881286621\n","minibatch AVG loss: 0.028304098173975945\n","Figure(640x480)\n","Test index 35 of 1 minibatch with batch_size of 8 time used: 0.3444187641143799\n","minibatch AVG loss: 0.024217385798692703\n","Figure(640x480)\n","Test index 36 of 1 minibatch with batch_size of 8 time used: 0.34152936935424805\n","minibatch AVG loss: 0.020597632974386215\n","Figure(640x480)\n","Test index 37 of 1 minibatch with batch_size of 8 time used: 0.34096765518188477\n","minibatch AVG loss: 0.025661714375019073\n","Figure(640x480)\n","Test index 38 of 1 minibatch with batch_size of 8 time used: 0.5249478816986084\n","minibatch AVG loss: 0.03880901634693146\n","Figure(640x480)\n","Test index 39 of 1 minibatch with batch_size of 8 time used: 0.35788869857788086\n","minibatch AVG loss: 0.04646195098757744\n","Figure(640x480)\n","Test index 40 of 1 minibatch with batch_size of 8 time used: 0.34569787979125977\n","minibatch AVG loss: 0.054550252854824066\n","Figure(640x480)\n","Test index 41 of 1 minibatch with batch_size of 8 time used: 0.3424832820892334\n","minibatch AVG loss: 
0.04401148483157158\n","Figure(640x480)\n","Test index 42 of 1 minibatch with batch_size of 8 time used: 0.3472878932952881\n","minibatch AVG loss: 0.053878482431173325\n","Figure(640x480)\n","Test index 43 of 1 minibatch with batch_size of 8 time used: 0.3536717891693115\n","minibatch AVG loss: 0.04746308550238609\n","Figure(640x480)\n","Test index 44 of 1 minibatch with batch_size of 8 time used: 0.343564510345459\n","minibatch AVG loss: 0.04708326607942581\n","Figure(640x480)\n","Test index 45 of 1 minibatch with batch_size of 8 time used: 0.3419811725616455\n","minibatch AVG loss: 0.04378387704491615\n","Figure(640x480)\n","Test index 46 of 1 minibatch with batch_size of 8 time used: 0.3454432487487793\n","minibatch AVG loss: 0.043106887489557266\n","Figure(640x480)\n","Test index 47 of 1 minibatch with batch_size of 8 time used: 0.34154748916625977\n","minibatch AVG loss: 0.06279397755861282\n","Figure(640x480)\n","Test index 48 of 1 minibatch with batch_size of 8 time used: 0.5151286125183105\n","minibatch AVG loss: 0.05888595059514046\n","Figure(640x480)\n","Test index 49 of 1 minibatch with batch_size of 8 time used: 0.3433067798614502\n","minibatch AVG loss: 0.04294142872095108\n","Figure(640x480)\n","Test index 50 of 1 minibatch with batch_size of 8 time used: 0.34302759170532227\n","minibatch AVG loss: 0.056246597319841385\n","Figure(640x480)\n","\n","Test_dataset_size: 400 \n","Avg Loss: 0.0380\n","Testing time 0:00:21\n"]}],"source":["!python PuzzleTesting.py --model sae_vit_base_patch16 --PromptTuning Deep --Prompt_Token_num 20 --batch_size 8 --fix_position_ratio 0.5 --fix_patch_size 16 --enable_visualize_check --data_path /data/Pathology_Experiment/dataset/PuzzleTuning_demoset --output_dir /home/Pathology_Experiment/imaging_results --log_dir /home/Pathology_Experiment/imaging_results --checkpoint_path 
/home/Pathology_Experiment/runs/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20_checkpoint-39999.pth"]},{"cell_type":"markdown","metadata":{"id":"SbLcZyoHbBLR"},"source":["Load the ViT prompt weights from the pre-trained checkpoint"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"background_save":true},"id":"cECU8Q9XqAB1"},"outputs":[],"source":["os.chdir(\"/home/Pathology_Experiment/code/utils\")"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"background_save":true},"id":"vhoXesRYbEal"},"outputs":[{"name":"stdout","output_type":"stream","text":["model forward cheacked\n","model is ready now!\n","checkpoint epoch 39999\n","DataParallel model loaded\n","model : ViT\n","prompt obtained\n","model trained by multi-GPUs has its single GPU copy saved at /home/Pathology_Experiment/saved_models/ViT_b16_224_timm_PuzzleTuning_SAE_CPIAm_Prompt_Deep_tokennum_20_promptstate.pth\n"]}],"source":["!python transfermodel.py --given_name ViT_b16_224_timm_PuzzleTuning_SAE_CPIAm_Prompt_Deep_tokennum_20_promptstate.pth --model_idx ViT --PromptTuning Deep --Prompt_Token_num 20 --edge_size 224 --checkpoint_path /home/Pathology_Experiment/runs/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20_checkpoint-39999.pth --save_model_path /home/Pathology_Experiment/saved_models"]},{"cell_type":"markdown","metadata":{"id":"Ia9qphxi86wH"},"source":["# Finetuning and comparison\n","* set up paths via the command line\n","* use argparse to set the hyper-parameters"]},{"cell_type":"markdown","metadata":{"id":"akuwL6GlbF8h"},"source":["## Finetuning without PuzzleTuning"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"background_save":true},"id":"yvoJWsQNrzhP"},"outputs":[],"source":["os.chdir(\"/home/Pathology_Experiment/code\")"]},{"cell_type":"markdown","metadata":{"id":"5cfIDQpn2L5H"},"source":["### ViT (with timm 
weight)"]},{"cell_type":"markdown","metadata":{"id":"0pgSCO1EbNSm"},"source":["Train"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"background_save":true},"id":"xTNPSavybFTm"},"outputs":[{"name":"stdout","output_type":"stream","text":["class_names: ['benign', 'malignant']\n","*********************************setting*************************************\n","Namespace(model_idx='ViT_base_timm_401_lf15_finetuning_warwick_CLS', drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0, cls_token_off=False, pos_embedding_off=False, att_module='SimAM', backbone_PT_off=False, gpu_idx=-1, dataroot='/data/Pathology_Experiment/dataset/warwick_CLS', model_path='/home/Pathology_Experiment/saved_models', draw_root='/home/Pathology_Experiment/runs/ViT_finetuning_with_timm', paint=True, enable_tensorboard=True, enable_attention_check=False, enable_visualize_check=False, PromptTuning=None, Prompt_Token_num=20, PromptUnFreeze=False, linearprobing=False, Pre_Trained_model_path=None, Prompt_state_path=None, enable_sam=False, augmentation_name=None, ratio_strategy=None, patch_strategy=None, loss_drive_threshold=4.0, fix_position_ratio=0.5, fix_patch_size=None, patch_size_jump=None, num_classes=0, edge_size=224, data_augmentation_mode=3, batch_size=8, num_epochs=50, intake_epochs=0, lr=1e-05, lrf=0.15, opt_name='Adam', check_minibatch=None, num_workers=2)\n","we dont have more GPU idx here, try to use gpu_idx=0\n","['convit_base',\n"," 'convit_small',\n"," 'convit_tiny',\n"," 'crossvit_9_240',\n"," 'crossvit_9_dagger_240',\n"," 'crossvit_15_240',\n"," 'crossvit_15_dagger_240',\n"," 'crossvit_15_dagger_408',\n"," 'crossvit_18_240',\n"," 'crossvit_18_dagger_240',\n"," 'crossvit_18_dagger_408',\n"," 'crossvit_base_240',\n"," 'crossvit_small_240',\n"," 'crossvit_tiny_240',\n"," 'levit_128',\n"," 'levit_128s',\n"," 'levit_192',\n"," 'levit_256',\n"," 'levit_384',\n"," 'vit_base_patch8_224',\n"," 'vit_base_patch8_224_in21k',\n"," 'vit_base_patch16_224',\n"," 
'vit_base_patch16_224_in21k',\n"," 'vit_base_patch16_224_miil',\n"," 'vit_base_patch16_224_miil_in21k',\n"," 'vit_base_patch16_384',\n"," 'vit_base_patch16_sam_224',\n"," 'vit_base_patch32_224',\n"," 'vit_base_patch32_224_in21k',\n"," 'vit_base_patch32_384',\n"," 'vit_base_patch32_sam_224',\n"," 'vit_base_r26_s32_224',\n"," 'vit_base_r50_s16_224',\n"," 'vit_base_r50_s16_224_in21k',\n"," 'vit_base_r50_s16_384',\n"," 'vit_base_resnet26d_224',\n"," 'vit_base_resnet50_224_in21k',\n"," 'vit_base_resnet50_384',\n"," 'vit_base_resnet50d_224',\n"," 'vit_giant_patch14_224',\n"," 'vit_gigantic_patch14_224',\n"," 'vit_huge_patch14_224',\n"," 'vit_huge_patch14_224_in21k',\n"," 'vit_large_patch16_224',\n"," 'vit_large_patch16_224_in21k',\n"," 'vit_large_patch16_384',\n"," 'vit_large_patch32_224',\n"," 'vit_large_patch32_224_in21k',\n"," 'vit_large_patch32_384',\n"," 'vit_large_r50_s32_224',\n"," 'vit_large_r50_s32_224_in21k',\n"," 'vit_large_r50_s32_384',\n"," 'vit_small_patch16_224',\n"," 'vit_small_patch16_224_in21k',\n"," 'vit_small_patch16_384',\n"," 'vit_small_patch32_224',\n"," 'vit_small_patch32_224_in21k',\n"," 'vit_small_patch32_384',\n"," 'vit_small_r26_s32_224',\n"," 'vit_small_r26_s32_224_in21k',\n"," 'vit_small_r26_s32_384',\n"," 'vit_small_resnet26d_224',\n"," 'vit_small_resnet50d_s16_224',\n"," 'vit_tiny_patch16_224',\n"," 'vit_tiny_patch16_224_in21k',\n"," 'vit_tiny_patch16_384',\n"," 'vit_tiny_r_s16_p8_224',\n"," 'vit_tiny_r_s16_p8_224_in21k',\n"," 'vit_tiny_r_s16_p8_384']\n","test model output: tensor([[-0.4719, 2.0119]], grad_fn=\u003cAddmmBackward0\u003e)\n","model is ready now!\n","building model (no-prompt) with pretrained_backbone status: True\n","timm loaded\n","GPU: 0\n","----------------------------------------------------------------\n"," Layer (type) Output Shape Param #\n","================================================================\n"," Conv2d-1 [-1, 768, 14, 14] 590,592\n"," Identity-2 [-1, 196, 768] 0\n"," PatchEmbed-3 [-1, 196, 768] 0\n"," 
Dropout-4 [-1, 197, 768] 0\n"," LayerNorm-5 [-1, 197, 768] 1,536\n"," Linear-6 [-1, 197, 2304] 1,771,776\n"," Dropout-7 [-1, 12, 197, 197] 0\n"," Linear-8 [-1, 197, 768] 590,592\n"," Dropout-9 [-1, 197, 768] 0\n"," Attention-10 [-1, 197, 768] 0\n"," Identity-11 [-1, 197, 768] 0\n"," LayerNorm-12 [-1, 197, 768] 1,536\n"," Linear-13 [-1, 197, 3072] 2,362,368\n"," GELU-14 [-1, 197, 3072] 0\n"," Dropout-15 [-1, 197, 3072] 0\n"," Linear-16 [-1, 197, 768] 2,360,064\n"," Dropout-17 [-1, 197, 768] 0\n"," Mlp-18 [-1, 197, 768] 0\n"," Identity-19 [-1, 197, 768] 0\n"," Block-20 [-1, 197, 768] 0\n"," LayerNorm-21 [-1, 197, 768] 1,536\n"," Linear-22 [-1, 197, 2304] 1,771,776\n"," Dropout-23 [-1, 12, 197, 197] 0\n"," Linear-24 [-1, 197, 768] 590,592\n"," Dropout-25 [-1, 197, 768] 0\n"," Attention-26 [-1, 197, 768] 0\n"," Identity-27 [-1, 197, 768] 0\n"," LayerNorm-28 [-1, 197, 768] 1,536\n"," Linear-29 [-1, 197, 3072] 2,362,368\n"," GELU-30 [-1, 197, 3072] 0\n"," Dropout-31 [-1, 197, 3072] 0\n"," Linear-32 [-1, 197, 768] 2,360,064\n"," Dropout-33 [-1, 197, 768] 0\n"," Mlp-34 [-1, 197, 768] 0\n"," Identity-35 [-1, 197, 768] 0\n"," Block-36 [-1, 197, 768] 0\n"," LayerNorm-37 [-1, 197, 768] 1,536\n"," Linear-38 [-1, 197, 2304] 1,771,776\n"," Dropout-39 [-1, 12, 197, 197] 0\n"," Linear-40 [-1, 197, 768] 590,592\n"," Dropout-41 [-1, 197, 768] 0\n"," Attention-42 [-1, 197, 768] 0\n"," Identity-43 [-1, 197, 768] 0\n"," LayerNorm-44 [-1, 197, 768] 1,536\n"," Linear-45 [-1, 197, 3072] 2,362,368\n"," GELU-46 [-1, 197, 3072] 0\n"," Dropout-47 [-1, 197, 3072] 0\n"," Linear-48 [-1, 197, 768] 2,360,064\n"," Dropout-49 [-1, 197, 768] 0\n"," Mlp-50 [-1, 197, 768] 0\n"," Identity-51 [-1, 197, 768] 0\n"," Block-52 [-1, 197, 768] 0\n"," LayerNorm-53 [-1, 197, 768] 1,536\n"," Linear-54 [-1, 197, 2304] 1,771,776\n"," Dropout-55 [-1, 12, 197, 197] 0\n"," Linear-56 [-1, 197, 768] 590,592\n"," Dropout-57 [-1, 197, 768] 0\n"," Attention-58 [-1, 197, 768] 0\n"," Identity-59 [-1, 197, 768] 0\n"," 
LayerNorm-60 [-1, 197, 768] 1,536\n"," Linear-61 [-1, 197, 3072] 2,362,368\n"," GELU-62 [-1, 197, 3072] 0\n"," Dropout-63 [-1, 197, 3072] 0\n"," Linear-64 [-1, 197, 768] 2,360,064\n"," Dropout-65 [-1, 197, 768] 0\n"," Mlp-66 [-1, 197, 768] 0\n"," Identity-67 [-1, 197, 768] 0\n"," Block-68 [-1, 197, 768] 0\n"," LayerNorm-69 [-1, 197, 768] 1,536\n"," Linear-70 [-1, 197, 2304] 1,771,776\n"," Dropout-71 [-1, 12, 197, 197] 0\n"," Linear-72 [-1, 197, 768] 590,592\n"," Dropout-73 [-1, 197, 768] 0\n"," Attention-74 [-1, 197, 768] 0\n"," Identity-75 [-1, 197, 768] 0\n"," LayerNorm-76 [-1, 197, 768] 1,536\n"," Linear-77 [-1, 197, 3072] 2,362,368\n"," GELU-78 [-1, 197, 3072] 0\n"," Dropout-79 [-1, 197, 3072] 0\n"," Linear-80 [-1, 197, 768] 2,360,064\n"," Dropout-81 [-1, 197, 768] 0\n"," Mlp-82 [-1, 197, 768] 0\n"," Identity-83 [-1, 197, 768] 0\n"," Block-84 [-1, 197, 768] 0\n"," LayerNorm-85 [-1, 197, 768] 1,536\n"," Linear-86 [-1, 197, 2304] 1,771,776\n"," Dropout-87 [-1, 12, 197, 197] 0\n"," Linear-88 [-1, 197, 768] 590,592\n"," Dropout-89 [-1, 197, 768] 0\n"," Attention-90 [-1, 197, 768] 0\n"," Identity-91 [-1, 197, 768] 0\n"," LayerNorm-92 [-1, 197, 768] 1,536\n"," Linear-93 [-1, 197, 3072] 2,362,368\n"," GELU-94 [-1, 197, 3072] 0\n"," Dropout-95 [-1, 197, 3072] 0\n"," Linear-96 [-1, 197, 768] 2,360,064\n"," Dropout-97 [-1, 197, 768] 0\n"," Mlp-98 [-1, 197, 768] 0\n"," Identity-99 [-1, 197, 768] 0\n"," Block-100 [-1, 197, 768] 0\n"," LayerNorm-101 [-1, 197, 768] 1,536\n"," Linear-102 [-1, 197, 2304] 1,771,776\n"," Dropout-103 [-1, 12, 197, 197] 0\n"," Linear-104 [-1, 197, 768] 590,592\n"," Dropout-105 [-1, 197, 768] 0\n"," Attention-106 [-1, 197, 768] 0\n"," Identity-107 [-1, 197, 768] 0\n"," LayerNorm-108 [-1, 197, 768] 1,536\n"," Linear-109 [-1, 197, 3072] 2,362,368\n"," GELU-110 [-1, 197, 3072] 0\n"," Dropout-111 [-1, 197, 3072] 0\n"," Linear-112 [-1, 197, 768] 2,360,064\n"," Dropout-113 [-1, 197, 768] 0\n"," Mlp-114 [-1, 197, 768] 0\n"," Identity-115 [-1, 197, 768] 
0\n"," Block-116 [-1, 197, 768] 0\n"," LayerNorm-117 [-1, 197, 768] 1,536\n"," Linear-118 [-1, 197, 2304] 1,771,776\n"," Dropout-119 [-1, 12, 197, 197] 0\n"," Linear-120 [-1, 197, 768] 590,592\n"," Dropout-121 [-1, 197, 768] 0\n"," Attention-122 [-1, 197, 768] 0\n"," Identity-123 [-1, 197, 768] 0\n"," LayerNorm-124 [-1, 197, 768] 1,536\n"," Linear-125 [-1, 197, 3072] 2,362,368\n"," GELU-126 [-1, 197, 3072] 0\n"," Dropout-127 [-1, 197, 3072] 0\n"," Linear-128 [-1, 197, 768] 2,360,064\n"," Dropout-129 [-1, 197, 768] 0\n"," Mlp-130 [-1, 197, 768] 0\n"," Identity-131 [-1, 197, 768] 0\n"," Block-132 [-1, 197, 768] 0\n"," LayerNorm-133 [-1, 197, 768] 1,536\n"," Linear-134 [-1, 197, 2304] 1,771,776\n"," Dropout-135 [-1, 12, 197, 197] 0\n"," Linear-136 [-1, 197, 768] 590,592\n"," Dropout-137 [-1, 197, 768] 0\n"," Attention-138 [-1, 197, 768] 0\n"," Identity-139 [-1, 197, 768] 0\n"," LayerNorm-140 [-1, 197, 768] 1,536\n"," Linear-141 [-1, 197, 3072] 2,362,368\n"," GELU-142 [-1, 197, 3072] 0\n"," Dropout-143 [-1, 197, 3072] 0\n"," Linear-144 [-1, 197, 768] 2,360,064\n"," Dropout-145 [-1, 197, 768] 0\n"," Mlp-146 [-1, 197, 768] 0\n"," Identity-147 [-1, 197, 768] 0\n"," Block-148 [-1, 197, 768] 0\n"," LayerNorm-149 [-1, 197, 768] 1,536\n"," Linear-150 [-1, 197, 2304] 1,771,776\n"," Dropout-151 [-1, 12, 197, 197] 0\n"," Linear-152 [-1, 197, 768] 590,592\n"," Dropout-153 [-1, 197, 768] 0\n"," Attention-154 [-1, 197, 768] 0\n"," Identity-155 [-1, 197, 768] 0\n"," LayerNorm-156 [-1, 197, 768] 1,536\n"," Linear-157 [-1, 197, 3072] 2,362,368\n"," GELU-158 [-1, 197, 3072] 0\n"," Dropout-159 [-1, 197, 3072] 0\n"," Linear-160 [-1, 197, 768] 2,360,064\n"," Dropout-161 [-1, 197, 768] 0\n"," Mlp-162 [-1, 197, 768] 0\n"," Identity-163 [-1, 197, 768] 0\n"," Block-164 [-1, 197, 768] 0\n"," LayerNorm-165 [-1, 197, 768] 1,536\n"," Linear-166 [-1, 197, 2304] 1,771,776\n"," Dropout-167 [-1, 12, 197, 197] 0\n"," Linear-168 [-1, 197, 768] 590,592\n"," Dropout-169 [-1, 197, 768] 0\n"," 
Attention-170 [-1, 197, 768] 0\n"," Identity-171 [-1, 197, 768] 0\n"," LayerNorm-172 [-1, 197, 768] 1,536\n"," Linear-173 [-1, 197, 3072] 2,362,368\n"," GELU-174 [-1, 197, 3072] 0\n"," Dropout-175 [-1, 197, 3072] 0\n"," Linear-176 [-1, 197, 768] 2,360,064\n"," Dropout-177 [-1, 197, 768] 0\n"," Mlp-178 [-1, 197, 768] 0\n"," Identity-179 [-1, 197, 768] 0\n"," Block-180 [-1, 197, 768] 0\n"," LayerNorm-181 [-1, 197, 768] 1,536\n"," Linear-182 [-1, 197, 2304] 1,771,776\n"," Dropout-183 [-1, 12, 197, 197] 0\n"," Linear-184 [-1, 197, 768] 590,592\n"," Dropout-185 [-1, 197, 768] 0\n"," Attention-186 [-1, 197, 768] 0\n"," Identity-187 [-1, 197, 768] 0\n"," LayerNorm-188 [-1, 197, 768] 1,536\n"," Linear-189 [-1, 197, 3072] 2,362,368\n"," GELU-190 [-1, 197, 3072] 0\n"," Dropout-191 [-1, 197, 3072] 0\n"," Linear-192 [-1, 197, 768] 2,360,064\n"," Dropout-193 [-1, 197, 768] 0\n"," Mlp-194 [-1, 197, 768] 0\n"," Identity-195 [-1, 197, 768] 0\n"," Block-196 [-1, 197, 768] 0\n"," LayerNorm-197 [-1, 197, 768] 1,536\n"," Identity-198 [-1, 768] 0\n"," Linear-199 [-1, 2] 1,538\n","================================================================\n","Total params: 85,648,130\n","Trainable params: 85,648,130\n","Non-trainable params: 0\n","----------------------------------------------------------------\n","Input size (MB): 0.57\n","Forward/backward pass size (MB): 408.54\n","Params size (MB): 326.72\n","Estimated Total Size (MB): 735.83\n","----------------------------------------------------------------\n","model : ViT_base_timm_401_lf15_finetuning_warwick_CLS\n","no valid counterparts augmentation selected\n","Epoch 1/50\n","----------\n","\n","Epoch: 1 train \n","Loss: 0.4487 Acc: 71.0145\n","benign precision: 76.9231 recall: 68.9655\n","benign sensitivity: 68.9655 specificity: 82.8571\n","benign FPR: 17.1429 NPV: 76.3158\n","benign TP: 20.0\n","benign TN: 29.0\n","benign FP: 6.0\n","benign FN: 9.0\n","malignant precision: 76.3158 recall: 82.8571\n","malignant sensitivity: 82.8571 
specificity: 68.9655\n","malignant FPR: 31.0345 NPV: 76.9231\n","malignant TP: 29.0\n","malignant TN: 20.0\n","malignant FP: 9.0\n","malignant FN: 6.0\n","\n","\n","\n","Epoch: 1 val \n","Loss: 0.0832 Acc: 93.7500\n","benign precision: 87.5000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 88.8889\n","benign FPR: 11.1111 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 8.0\n","benign FP: 1.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 88.8889\n","malignant sensitivity: 88.8889 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 87.5000\n","malignant TP: 8.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 1.0\n","\n","\n","\n","Epoch 2/50\n","----------\n","\n","Epoch: 2 train \n","Loss: 0.0566 Acc: 91.3043\n","benign precision: 96.4286 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 97.2973\n","benign FPR: 2.7027 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 36.0\n","benign FP: 1.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 97.2973\n","malignant sensitivity: 97.2973 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 96.4286\n","malignant TP: 36.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 1.0\n","\n","\n","\n","Epoch: 2 val \n","Loss: 0.2076 Acc: 87.5000\n","benign precision: 100.0000 recall: 71.4286\n","benign sensitivity: 71.4286 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 81.8182\n","benign TP: 5.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 2.0\n","malignant precision: 81.8182 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 71.4286\n","malignant FPR: 28.5714 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 5.0\n","malignant FP: 2.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 3/50\n","----------\n","\n","Epoch: 3 train \n","Loss: 0.0412 Acc: 89.8551\n","benign precision: 100.0000 recall: 93.1034\n","benign sensitivity: 93.1034 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 94.5946\n","benign TP: 27.0\n","benign TN: 
35.0\n","benign FP: 0.0\n","benign FN: 2.0\n","malignant precision: 94.5946 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 93.1034\n","malignant FPR: 6.8966 NPV: 100.0000\n","malignant TP: 35.0\n","malignant TN: 27.0\n","malignant FP: 2.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 3 val \n","Loss: 0.0047 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 4/50\n","----------\n","\n","Epoch: 4 train \n","Loss: 0.0056 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 4 val \n","Loss: 0.0107 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 5/50\n","----------\n","\n","Epoch: 5 train \n","Loss: 0.0021 Acc: 92.7536\n","benign precision: 
100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 26.0\n","benign TN: 38.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 38.0\n","malignant TN: 26.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 5 val \n","Loss: 0.0044 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 6/50\n","----------\n","\n","Epoch: 6 train \n","Loss: 0.0006 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 6 val \n","Loss: 0.0027 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 
0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 7/50\n","----------\n","\n","Epoch: 7 train \n","Loss: 0.0004 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 7 val \n","Loss: 0.0024 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 8/50\n","----------\n","\n","Epoch: 8 train \n","Loss: 0.0004 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 26.0\n","benign TN: 38.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 38.0\n","malignant TN: 26.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 8 val \n","Loss: 0.0025 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 
100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 9/50\n","----------\n","\n","Epoch: 9 train \n","Loss: 0.0002 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 9 val \n","Loss: 0.0026 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 10/50\n","----------\n","\n","Epoch: 10 train \n","Loss: 0.0002 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 29.0\n","benign TN: 35.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 35.0\n","malignant TN: 29.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 10 val \n","Loss: 0.0028 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 
100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 11/50\n","----------\n","\n","Epoch: 11 train \n","Loss: 0.0007 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 11 val \n","Loss: 0.0027 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 12/50\n","----------\n","\n","Epoch: 12 train \n","Loss: 0.0003 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 
0.0\n","\n","\n","\n","Epoch: 12 val \n","Loss: 0.0024 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 13/50\n","----------\n","\n","Epoch: 13 train \n","Loss: 0.0003 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 13 val \n","Loss: 0.0024 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 14/50\n","----------\n","\n","Epoch: 14 train \n","Loss: 0.0004 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 29.0\n","benign TN: 35.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 
100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 35.0\n","malignant TN: 29.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 14 val \n","Loss: 0.0024 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 15/50\n","----------\n","\n","Epoch: 15 train \n","Loss: 0.0002 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 15 val \n","Loss: 0.0024 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 16/50\n","----------\n","\n","Epoch: 16 train \n","Loss: 0.0005 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 
100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 16 val \n","Loss: 0.0024 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 17/50\n","----------\n","\n","Epoch: 17 train \n","Loss: 0.0003 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 17 val \n","Loss: 0.0024 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 18/50\n","----------\n","\n","Epoch: 18 train 
\n","Loss: 0.0002 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 26.0\n","benign TN: 38.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 38.0\n","malignant TN: 26.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 18 val \n","Loss: 0.0024 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 19/50\n","----------\n","\n","Epoch: 19 train \n","Loss: 0.0002 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 19 val \n","Loss: 0.0025 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 
100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 20/50\n","----------\n","\n","Epoch: 20 train \n","Loss: 0.0002 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 20 val \n","Loss: 0.0025 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 21/50\n","----------\n","\n","Epoch: 21 train \n","Loss: 0.0005 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 29.0\n","benign TN: 35.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 35.0\n","malignant TN: 29.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 21 val \n","Loss: 0.0024 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 
0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 22/50\n","----------\n","\n","Epoch: 22 train \n","Loss: 0.0003 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 22 val \n","Loss: 0.0025 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 23/50\n","----------\n","\n","Epoch: 23 train \n","Loss: 0.0003 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 23 val \n","Loss: 0.0025 Acc: 100.0000\n","benign precision: 100.0000 recall: 
100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 24/50\n","----------\n","\n","Epoch: 24 train \n","Loss: 0.0002 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 24 val \n","Loss: 0.0026 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 25/50\n","----------\n","\n","Epoch: 25 train \n","Loss: 0.0002 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant 
TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 25 val \n","Loss: 0.0026 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 26/50\n","----------\n","\n","Epoch: 26 train \n","Loss: 0.0002 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 26 val \n","Loss: 0.0026 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 27/50\n","----------\n","\n","Epoch: 27 train \n","Loss: 0.0002 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 
100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 27 val \n","Loss: 0.0026 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 28/50\n","----------\n","\n","Epoch: 28 train \n","Loss: 0.0002 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 28 val \n","Loss: 0.0026 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 29/50\n","----------\n","\n","Epoch: 29 train \n","Loss: 0.0002 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 
specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 29 val \n","Loss: 0.0026 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 30/50\n","----------\n","\n","Epoch: 30 train \n","Loss: 0.0003 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 30 val \n","Loss: 0.0026 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 
31/50\n","----------\n","\n","Epoch: 31 train \n","Loss: 0.0003 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 31 val \n","Loss: 0.0025 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 32/50\n","----------\n","\n","Epoch: 32 train \n","Loss: 0.0002 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 32 val \n","Loss: 0.0025 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 
100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 33/50\n","----------\n","\n","Epoch: 33 train \n","Loss: 0.0002 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 29.0\n","benign TN: 35.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 35.0\n","malignant TN: 29.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 33 val \n","Loss: 0.0025 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 34/50\n","----------\n","\n","Epoch: 34 train \n","Loss: 0.0003 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 34 val \n","Loss: 0.0025 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 
7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 35/50\n","----------\n","\n","Epoch: 35 train \n","Loss: 0.0002 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 29.0\n","benign TN: 35.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 35.0\n","malignant TN: 29.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 35 val \n","Loss: 0.0025 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 36/50\n","----------\n","\n","Epoch: 36 train \n","Loss: 0.0002 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 36 val \n","Loss: 0.0024 Acc: 
100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 37/50\n","----------\n","\n","Epoch: 37 train \n","Loss: 0.0002 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 29.0\n","benign TN: 35.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 35.0\n","malignant TN: 29.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 37 val \n","Loss: 0.0024 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 38/50\n","----------\n","\n","Epoch: 38 train \n","Loss: 0.0003 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 
100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 38 val \n","Loss: 0.0024 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 39/50\n","----------\n","\n","Epoch: 39 train \n","Loss: 0.0004 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 39 val \n","Loss: 0.0024 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 40/50\n","----------\n","\n","Epoch: 40 train \n","Loss: 0.0003 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 29.0\n","benign TN: 35.0\n","benign FP: 
0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 35.0\n","malignant TN: 29.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 40 val \n","Loss: 0.0023 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 41/50\n","----------\n","\n","Epoch: 41 train \n","Loss: 0.0002 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 41 val \n","Loss: 0.0023 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 42/50\n","----------\n","\n","Epoch: 42 train \n","Loss: 0.0002 Acc: 92.7536\n","benign precision: 100.0000 recall: 
100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 42 val \n","Loss: 0.0023 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 43/50\n","----------\n","\n","Epoch: 43 train \n","Loss: 0.0002 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 43 val \n","Loss: 0.0023 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 
0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 44/50\n","----------\n","\n","Epoch: 44 train \n","Loss: 0.0002 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 30.0\n","benign TN: 34.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 34.0\n","malignant TN: 30.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 44 val \n","Loss: 0.0023 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 45/50\n","----------\n","\n","Epoch: 45 train \n","Loss: 0.0002 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 45 val \n","Loss: 0.0023 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 
100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 46/50\n","----------\n","\n","Epoch: 46 train \n","Loss: 0.0003 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 29.0\n","benign TN: 35.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 35.0\n","malignant TN: 29.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 46 val \n","Loss: 0.0023 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 47/50\n","----------\n","\n","Epoch: 47 train \n","Loss: 0.0002 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 47 val \n","Loss: 0.0023 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 
100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 48/50\n","----------\n","\n","Epoch: 48 train \n","Loss: 0.0003 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 29.0\n","benign TN: 35.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 35.0\n","malignant TN: 29.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 48 val \n","Loss: 0.0023 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 49/50\n","----------\n","\n","Epoch: 49 train \n","Loss: 0.0003 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 
0.0\n","\n","\n","\n","Epoch: 49 val \n","Loss: 0.0023 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 50/50\n","----------\n","\n","Epoch: 50 train \n","Loss: 0.0003 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 50 val \n","Loss: 0.0023 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Training complete in 0m 53s\n","Best epoch idx: 50\n","Best epoch train Acc: 92.753623\n","Best epoch val Acc: 100.000000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 
100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","model trained by GPU (idx:0) has been saved at /home/Pathology_Experiment/saved_models/CLS_ViT_base_timm_401_lf15_finetuning_warwick_CLS.pth\n"]}],"source":["!python Train.py --edge_size 224 --data_augmentation_mode 3 --lr 1e-05 --lrf 0.15 --enable_tensorboard --model_idx ViT_base_timm_401_lf15_finetuning_warwick_CLS --dataroot /data/Pathology_Experiment/dataset/warwick_CLS --draw_root /home/Pathology_Experiment/runs/ViT_finetuning_with_timm --model_path /home/Pathology_Experiment/saved_models"]},{"cell_type":"markdown","metadata":{"id":"Di0tovUQbPb1"},"source":["Test"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"background_save":true},"id":"3HeuBzgsbQJ6"},"outputs":[{"name":"stdout","output_type":"stream","text":["class_names: ['benign', 'malignant']\n","['convit_base',\n"," 'convit_small',\n"," 'convit_tiny',\n"," 'crossvit_9_240',\n"," 'crossvit_9_dagger_240',\n"," 'crossvit_15_240',\n"," 'crossvit_15_dagger_240',\n"," 'crossvit_15_dagger_408',\n"," 'crossvit_18_240',\n"," 'crossvit_18_dagger_240',\n"," 'crossvit_18_dagger_408',\n"," 'crossvit_base_240',\n"," 'crossvit_small_240',\n"," 'crossvit_tiny_240',\n"," 'levit_128',\n"," 'levit_128s',\n"," 'levit_192',\n"," 'levit_256',\n"," 'levit_384',\n"," 'vit_base_patch8_224',\n"," 'vit_base_patch8_224_in21k',\n"," 'vit_base_patch16_224',\n"," 'vit_base_patch16_224_in21k',\n"," 'vit_base_patch16_224_miil',\n"," 'vit_base_patch16_224_miil_in21k',\n"," 'vit_base_patch16_384',\n"," 'vit_base_patch16_sam_224',\n"," 'vit_base_patch32_224',\n"," 'vit_base_patch32_224_in21k',\n"," 'vit_base_patch32_384',\n"," 'vit_base_patch32_sam_224',\n"," 'vit_base_r26_s32_224',\n"," 'vit_base_r50_s16_224',\n"," 'vit_base_r50_s16_224_in21k',\n"," 'vit_base_r50_s16_384',\n"," 'vit_base_resnet26d_224',\n"," 'vit_base_resnet50_224_in21k',\n"," 'vit_base_resnet50_384',\n"," 'vit_base_resnet50d_224',\n"," 'vit_giant_patch14_224',\n"," 'vit_gigantic_patch14_224',\n"," 
'vit_huge_patch14_224',\n"," 'vit_huge_patch14_224_in21k',\n"," 'vit_large_patch16_224',\n"," 'vit_large_patch16_224_in21k',\n"," 'vit_large_patch16_384',\n"," 'vit_large_patch32_224',\n"," 'vit_large_patch32_224_in21k',\n"," 'vit_large_patch32_384',\n"," 'vit_large_r50_s32_224',\n"," 'vit_large_r50_s32_224_in21k',\n"," 'vit_large_r50_s32_384',\n"," 'vit_small_patch16_224',\n"," 'vit_small_patch16_224_in21k',\n"," 'vit_small_patch16_384',\n"," 'vit_small_patch32_224',\n"," 'vit_small_patch32_224_in21k',\n"," 'vit_small_patch32_384',\n"," 'vit_small_r26_s32_224',\n"," 'vit_small_r26_s32_224_in21k',\n"," 'vit_small_r26_s32_384',\n"," 'vit_small_resnet26d_224',\n"," 'vit_small_resnet50d_s16_224',\n"," 'vit_tiny_patch16_224',\n"," 'vit_tiny_patch16_224_in21k',\n"," 'vit_tiny_patch16_384',\n"," 'vit_tiny_r_s16_p8_224',\n"," 'vit_tiny_r_s16_p8_224_in21k',\n"," 'vit_tiny_r_s16_p8_384']\n","test model output: tensor([[-0.2566, -0.2077]], grad_fn=\u003cAddmmBackward0\u003e)\n","model is ready now!\n","model loaded\n","model : ViT_base_timm_401_lf15_finetuning_warwick_CLS\n","*********************************setting*************************************\n","Namespace(model_idx='ViT_base_timm_401_lf15_finetuning_warwick_CLS', drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0, cls_token_off=False, pos_embedding_off=False, att_module='SimAM', gpu_idx=0, dataroot='/data/Pathology_Experiment/dataset/warwick_CLS', model_path='/home/Pathology_Experiment/saved_models', draw_root='/home/Pathology_Experiment/runs/ViT_finetuning_with_timm', model_path_by_hand=None, paint=True, enable_tensorboard=False, enable_attention_check=False, enable_visualize_check=False, data_augmentation_mode=3, PromptTuning=None, Prompt_Token_num=20, PromptUnFreeze=False, Pre_Trained_model_path=None, num_classes=0, edge_size=224, batch_size=1, check_minibatch=None)\n","Epoch: Test\n","----------\n","Epoch: test test index of 4 minibatch: 1 time used: 2.0843310356140137\n","minibatch AVG loss: 
0.049242537235841155\n","Epoch: test test index of 4 minibatch: 2 time used: 0.04948902130126953\n","minibatch AVG loss: 0.0026592123062982864\n","Epoch: test test index of 4 minibatch: 3 time used: 0.049538612365722656\n","minibatch AVG loss: 7.384463651760598e-05\n","Epoch: test test index of 4 minibatch: 4 time used: 0.04914069175720215\n","minibatch AVG loss: 0.03960687841677668\n","Epoch: test test index of 4 minibatch: 5 time used: 0.04950857162475586\n","minibatch AVG loss: 0.20808867284495136\n","Epoch: test test index of 4 minibatch: 6 time used: 0.04882478713989258\n","minibatch AVG loss: 0.00309721480675762\n","Epoch: test test index of 4 minibatch: 7 time used: 0.05091118812561035\n","minibatch AVG loss: 0.09648202347295864\n","Epoch: test test index of 4 minibatch: 8 time used: 0.047635555267333984\n","minibatch AVG loss: 0.000145436038110347\n","Epoch: test test index of 4 minibatch: 9 time used: 0.04693150520324707\n","minibatch AVG loss: 0.0013968248640594538\n","Epoch: test test index of 4 minibatch: 10 time used: 0.04672741889953613\n","minibatch AVG loss: 0.001999404456000775\n","Epoch: test test index of 4 minibatch: 11 time used: 0.04696464538574219\n","minibatch AVG loss: 0.012991284020245075\n","Epoch: test test index of 4 minibatch: 12 time used: 0.04732871055603027\n","minibatch AVG loss: 0.0032983069049805636\n","Epoch: test test index of 4 minibatch: 13 time used: 0.04779481887817383\n","minibatch AVG loss: 0.0027486889557621907\n","Epoch: test test index of 4 minibatch: 14 time used: 0.04719042778015137\n","minibatch AVG loss: 0.002299645588209387\n","Epoch: test test index of 4 minibatch: 15 time used: 0.04710078239440918\n","minibatch AVG loss: 0.0003831974645436276\n","Epoch: test test index of 4 minibatch: 16 time used: 0.04697871208190918\n","minibatch AVG loss: 0.0005818266072310507\n","Epoch: test test index of 4 minibatch: 17 time used: 0.048592329025268555\n","minibatch AVG loss: 0.0008116918361338321\n","Epoch: test test index 
of 4 minibatch: 18 time used: 0.048664093017578125\n","minibatch AVG loss: 0.012041694470099173\n","Epoch: test test index of 4 minibatch: 19 time used: 0.04766678810119629\n","minibatch AVG loss: 6.261242378968745e-05\n","Epoch: test test index of 4 minibatch: 20 time used: 0.0470576286315918\n","minibatch AVG loss: 0.022635395129327662\n","\n","Epoch: test \n","Loss: 0.0230 Acc: 98.7500\n","benign precision: 100.0000 recall: 97.2973\n","benign sensitivity: 97.2973 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 97.7273\n","benign TP: 36.0\n","benign TN: 43.0\n","benign FP: 0.0\n","benign FN: 1.0\n","malignant precision: 97.7273 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 97.2973\n","malignant FPR: 2.7027 NPV: 100.0000\n","malignant TP: 43.0\n","malignant TN: 36.0\n","malignant FP: 1.0\n","malignant FN: 0.0\n","\n","\n","Testing complete in 0m 3s\n"]}],"source":["!python Test.py --edge_size 224 --data_augmentation_mode 3 --model_idx ViT_base_timm_401_lf15_finetuning_warwick_CLS --dataroot /data/Pathology_Experiment/dataset/warwick_CLS --draw_root /home/Pathology_Experiment/runs/ViT_finetuning_with_timm --model_path /home/Pathology_Experiment/saved_models"]},{"cell_type":"markdown","metadata":{"id":"I4siigu52Od8"},"source":["### VPT + finetuning (with timm weight)"]},{"cell_type":"markdown","metadata":{"id":"j8tEUJf22XL-"},"source":["Train"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"background_save":true},"id":"3m2pR6ba2QWf"},"outputs":[{"name":"stdout","output_type":"stream","text":["class_names: ['benign', 'malignant']\n","*********************************setting*************************************\n","Namespace(model_idx='ViT_base_timm_PromptDeep_20_401_lf15_finetuning_warwick_CLS', drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0, cls_token_off=False, pos_embedding_off=False, att_module='SimAM', backbone_PT_off=False, gpu_idx=-1, dataroot='/data/Pathology_Experiment/dataset/warwick_CLS', 
model_path='/home/Pathology_Experiment/saved_models', draw_root='/home/Pathology_Experiment/runs/VPT_finetuning_with_timm', paint=True, enable_tensorboard=True, enable_attention_check=False, enable_visualize_check=False, PromptTuning='Deep', Prompt_Token_num=20, PromptUnFreeze=False, linearprobing=False, Pre_Trained_model_path=None, Prompt_state_path=None, enable_sam=False, augmentation_name=None, ratio_strategy=None, patch_strategy=None, loss_drive_threshold=4.0, fix_position_ratio=0.5, fix_patch_size=None, patch_size_jump=None, num_classes=0, edge_size=224, data_augmentation_mode=3, batch_size=8, num_epochs=50, intake_epochs=0, lr=1e-05, lrf=0.15, opt_name='Adam', check_minibatch=None, num_workers=2)\n","we dont have more GPU idx here, try to use gpu_idx=0\n","PromptTuning of ViT_base_timm_PromptDeep_20_401_lf15_finetuning_warwick_CLS\n","Prompt VPT type: Deep\n","backbone base_state_dict of timm\n","prompting with empty prompt_state: prompt_state of None\n","in prompt model building, timm ViT loaded for base_state_dict\n","model forward cheacked\n","model is ready now!\n","GPU: 0\n","----------------------------------------------------------------\n"," Layer (type) Output Shape Param #\n","================================================================\n"," Conv2d-1 [-1, 768, 14, 14] 590,592\n"," Identity-2 [-1, 196, 768] 0\n"," PatchEmbed-3 [-1, 196, 768] 0\n"," Dropout-4 [-1, 197, 768] 0\n"," LayerNorm-5 [-1, 217, 768] 1,536\n"," Linear-6 [-1, 217, 2304] 1,771,776\n"," Dropout-7 [-1, 12, 217, 217] 0\n"," Linear-8 [-1, 217, 768] 590,592\n"," Dropout-9 [-1, 217, 768] 0\n"," Attention-10 [-1, 217, 768] 0\n"," Identity-11 [-1, 217, 768] 0\n"," LayerNorm-12 [-1, 217, 768] 1,536\n"," Linear-13 [-1, 217, 3072] 2,362,368\n"," GELU-14 [-1, 217, 3072] 0\n"," Dropout-15 [-1, 217, 3072] 0\n"," Linear-16 [-1, 217, 768] 2,360,064\n"," Dropout-17 [-1, 217, 768] 0\n"," Mlp-18 [-1, 217, 768] 0\n"," Identity-19 [-1, 217, 768] 0\n"," Block-20 [-1, 217, 768] 0\n"," LayerNorm-21 
[-1, 217, 768] 1,536\n"," Linear-22 [-1, 217, 2304] 1,771,776\n"," Dropout-23 [-1, 12, 217, 217] 0\n"," Linear-24 [-1, 217, 768] 590,592\n"," Dropout-25 [-1, 217, 768] 0\n"," Attention-26 [-1, 217, 768] 0\n"," Identity-27 [-1, 217, 768] 0\n"," LayerNorm-28 [-1, 217, 768] 1,536\n"," Linear-29 [-1, 217, 3072] 2,362,368\n"," GELU-30 [-1, 217, 3072] 0\n"," Dropout-31 [-1, 217, 3072] 0\n"," Linear-32 [-1, 217, 768] 2,360,064\n"," Dropout-33 [-1, 217, 768] 0\n"," Mlp-34 [-1, 217, 768] 0\n"," Identity-35 [-1, 217, 768] 0\n"," Block-36 [-1, 217, 768] 0\n"," LayerNorm-37 [-1, 217, 768] 1,536\n"," Linear-38 [-1, 217, 2304] 1,771,776\n"," Dropout-39 [-1, 12, 217, 217] 0\n"," Linear-40 [-1, 217, 768] 590,592\n"," Dropout-41 [-1, 217, 768] 0\n"," Attention-42 [-1, 217, 768] 0\n"," Identity-43 [-1, 217, 768] 0\n"," LayerNorm-44 [-1, 217, 768] 1,536\n"," Linear-45 [-1, 217, 3072] 2,362,368\n"," GELU-46 [-1, 217, 3072] 0\n"," Dropout-47 [-1, 217, 3072] 0\n"," Linear-48 [-1, 217, 768] 2,360,064\n"," Dropout-49 [-1, 217, 768] 0\n"," Mlp-50 [-1, 217, 768] 0\n"," Identity-51 [-1, 217, 768] 0\n"," Block-52 [-1, 217, 768] 0\n"," LayerNorm-53 [-1, 217, 768] 1,536\n"," Linear-54 [-1, 217, 2304] 1,771,776\n"," Dropout-55 [-1, 12, 217, 217] 0\n"," Linear-56 [-1, 217, 768] 590,592\n"," Dropout-57 [-1, 217, 768] 0\n"," Attention-58 [-1, 217, 768] 0\n"," Identity-59 [-1, 217, 768] 0\n"," LayerNorm-60 [-1, 217, 768] 1,536\n"," Linear-61 [-1, 217, 3072] 2,362,368\n"," GELU-62 [-1, 217, 3072] 0\n"," Dropout-63 [-1, 217, 3072] 0\n"," Linear-64 [-1, 217, 768] 2,360,064\n"," Dropout-65 [-1, 217, 768] 0\n"," Mlp-66 [-1, 217, 768] 0\n"," Identity-67 [-1, 217, 768] 0\n"," Block-68 [-1, 217, 768] 0\n"," LayerNorm-69 [-1, 217, 768] 1,536\n"," Linear-70 [-1, 217, 2304] 1,771,776\n"," Dropout-71 [-1, 12, 217, 217] 0\n"," Linear-72 [-1, 217, 768] 590,592\n"," Dropout-73 [-1, 217, 768] 0\n"," Attention-74 [-1, 217, 768] 0\n"," Identity-75 [-1, 217, 768] 0\n"," LayerNorm-76 [-1, 217, 768] 1,536\n"," Linear-77 
[-1, 217, 3072] 2,362,368\n"," GELU-78 [-1, 217, 3072] 0\n"," Dropout-79 [-1, 217, 3072] 0\n"," Linear-80 [-1, 217, 768] 2,360,064\n"," Dropout-81 [-1, 217, 768] 0\n"," Mlp-82 [-1, 217, 768] 0\n"," Identity-83 [-1, 217, 768] 0\n"," Block-84 [-1, 217, 768] 0\n"," LayerNorm-85 [-1, 217, 768] 1,536\n"," Linear-86 [-1, 217, 2304] 1,771,776\n"," Dropout-87 [-1, 12, 217, 217] 0\n"," Linear-88 [-1, 217, 768] 590,592\n"," Dropout-89 [-1, 217, 768] 0\n"," Attention-90 [-1, 217, 768] 0\n"," Identity-91 [-1, 217, 768] 0\n"," LayerNorm-92 [-1, 217, 768] 1,536\n"," Linear-93 [-1, 217, 3072] 2,362,368\n"," GELU-94 [-1, 217, 3072] 0\n"," Dropout-95 [-1, 217, 3072] 0\n"," Linear-96 [-1, 217, 768] 2,360,064\n"," Dropout-97 [-1, 217, 768] 0\n"," Mlp-98 [-1, 217, 768] 0\n"," Identity-99 [-1, 217, 768] 0\n"," Block-100 [-1, 217, 768] 0\n"," LayerNorm-101 [-1, 217, 768] 1,536\n"," Linear-102 [-1, 217, 2304] 1,771,776\n"," Dropout-103 [-1, 12, 217, 217] 0\n"," Linear-104 [-1, 217, 768] 590,592\n"," Dropout-105 [-1, 217, 768] 0\n"," Attention-106 [-1, 217, 768] 0\n"," Identity-107 [-1, 217, 768] 0\n"," LayerNorm-108 [-1, 217, 768] 1,536\n"," Linear-109 [-1, 217, 3072] 2,362,368\n"," GELU-110 [-1, 217, 3072] 0\n"," Dropout-111 [-1, 217, 3072] 0\n"," Linear-112 [-1, 217, 768] 2,360,064\n"," Dropout-113 [-1, 217, 768] 0\n"," Mlp-114 [-1, 217, 768] 0\n"," Identity-115 [-1, 217, 768] 0\n"," Block-116 [-1, 217, 768] 0\n"," LayerNorm-117 [-1, 217, 768] 1,536\n"," Linear-118 [-1, 217, 2304] 1,771,776\n"," Dropout-119 [-1, 12, 217, 217] 0\n"," Linear-120 [-1, 217, 768] 590,592\n"," Dropout-121 [-1, 217, 768] 0\n"," Attention-122 [-1, 217, 768] 0\n"," Identity-123 [-1, 217, 768] 0\n"," LayerNorm-124 [-1, 217, 768] 1,536\n"," Linear-125 [-1, 217, 3072] 2,362,368\n"," GELU-126 [-1, 217, 3072] 0\n"," Dropout-127 [-1, 217, 3072] 0\n"," Linear-128 [-1, 217, 768] 2,360,064\n"," Dropout-129 [-1, 217, 768] 0\n"," Mlp-130 [-1, 217, 768] 0\n"," Identity-131 [-1, 217, 768] 0\n"," Block-132 [-1, 217, 768] 
0\n"," LayerNorm-133 [-1, 217, 768] 1,536\n"," Linear-134 [-1, 217, 2304] 1,771,776\n"," Dropout-135 [-1, 12, 217, 217] 0\n"," Linear-136 [-1, 217, 768] 590,592\n"," Dropout-137 [-1, 217, 768] 0\n"," Attention-138 [-1, 217, 768] 0\n"," Identity-139 [-1, 217, 768] 0\n"," LayerNorm-140 [-1, 217, 768] 1,536\n"," Linear-141 [-1, 217, 3072] 2,362,368\n"," GELU-142 [-1, 217, 3072] 0\n"," Dropout-143 [-1, 217, 3072] 0\n"," Linear-144 [-1, 217, 768] 2,360,064\n"," Dropout-145 [-1, 217, 768] 0\n"," Mlp-146 [-1, 217, 768] 0\n"," Identity-147 [-1, 217, 768] 0\n"," Block-148 [-1, 217, 768] 0\n"," LayerNorm-149 [-1, 217, 768] 1,536\n"," Linear-150 [-1, 217, 2304] 1,771,776\n"," Dropout-151 [-1, 12, 217, 217] 0\n"," Linear-152 [-1, 217, 768] 590,592\n"," Dropout-153 [-1, 217, 768] 0\n"," Attention-154 [-1, 217, 768] 0\n"," Identity-155 [-1, 217, 768] 0\n"," LayerNorm-156 [-1, 217, 768] 1,536\n"," Linear-157 [-1, 217, 3072] 2,362,368\n"," GELU-158 [-1, 217, 3072] 0\n"," Dropout-159 [-1, 217, 3072] 0\n"," Linear-160 [-1, 217, 768] 2,360,064\n"," Dropout-161 [-1, 217, 768] 0\n"," Mlp-162 [-1, 217, 768] 0\n"," Identity-163 [-1, 217, 768] 0\n"," Block-164 [-1, 217, 768] 0\n"," LayerNorm-165 [-1, 217, 768] 1,536\n"," Linear-166 [-1, 217, 2304] 1,771,776\n"," Dropout-167 [-1, 12, 217, 217] 0\n"," Linear-168 [-1, 217, 768] 590,592\n"," Dropout-169 [-1, 217, 768] 0\n"," Attention-170 [-1, 217, 768] 0\n"," Identity-171 [-1, 217, 768] 0\n"," LayerNorm-172 [-1, 217, 768] 1,536\n"," Linear-173 [-1, 217, 3072] 2,362,368\n"," GELU-174 [-1, 217, 3072] 0\n"," Dropout-175 [-1, 217, 3072] 0\n"," Linear-176 [-1, 217, 768] 2,360,064\n"," Dropout-177 [-1, 217, 768] 0\n"," Mlp-178 [-1, 217, 768] 0\n"," Identity-179 [-1, 217, 768] 0\n"," Block-180 [-1, 217, 768] 0\n"," LayerNorm-181 [-1, 217, 768] 1,536\n"," Linear-182 [-1, 217, 2304] 1,771,776\n"," Dropout-183 [-1, 12, 217, 217] 0\n"," Linear-184 [-1, 217, 768] 590,592\n"," Dropout-185 [-1, 217, 768] 0\n"," Attention-186 [-1, 217, 768] 0\n"," 
Identity-187 [-1, 217, 768] 0\n"," LayerNorm-188 [-1, 217, 768] 1,536\n"," Linear-189 [-1, 217, 3072] 2,362,368\n"," GELU-190 [-1, 217, 3072] 0\n"," Dropout-191 [-1, 217, 3072] 0\n"," Linear-192 [-1, 217, 768] 2,360,064\n"," Dropout-193 [-1, 217, 768] 0\n"," Mlp-194 [-1, 217, 768] 0\n"," Identity-195 [-1, 217, 768] 0\n"," Block-196 [-1, 217, 768] 0\n"," LayerNorm-197 [-1, 197, 768] 1,536\n"," Identity-198 [-1, 768] 0\n"," Linear-199 [-1, 2] 1,538\n","================================================================\n","Total params: 85,648,130\n","Trainable params: 1,538\n","Non-trainable params: 85,646,592\n","----------------------------------------------------------------\n","Input size (MB): 0.57\n","Forward/backward pass size (MB): 454.20\n","Params size (MB): 326.72\n","Estimated Total Size (MB): 781.49\n","----------------------------------------------------------------\n","model : ViT_base_timm_PromptDeep_20_401_lf15_finetuning_warwick_CLS\n","no valid counterparts augmentation selected\n","Epoch 1/50\n","----------\n","\n","Epoch: 1 train \n","Loss: 0.7798 Acc: 55.0725\n","benign precision: 55.5556 recall: 51.7241\n","benign sensitivity: 51.7241 specificity: 65.7143\n","benign FPR: 34.2857 NPV: 62.1622\n","benign TP: 15.0\n","benign TN: 23.0\n","benign FP: 12.0\n","benign FN: 14.0\n","malignant precision: 62.1622 recall: 65.7143\n","malignant sensitivity: 65.7143 specificity: 51.7241\n","malignant FPR: 48.2759 NPV: 55.5556\n","malignant TP: 23.0\n","malignant TN: 15.0\n","malignant FP: 14.0\n","malignant FN: 12.0\n","\n","\n","\n","Epoch: 1 val \n","Loss: 0.6996 Acc: 62.5000\n","benign precision: 60.0000 recall: 42.8571\n","benign sensitivity: 42.8571 specificity: 77.7778\n","benign FPR: 22.2222 NPV: 63.6364\n","benign TP: 3.0\n","benign TN: 7.0\n","benign FP: 2.0\n","benign FN: 4.0\n","malignant precision: 63.6364 recall: 77.7778\n","malignant sensitivity: 77.7778 specificity: 42.8571\n","malignant FPR: 57.1429 NPV: 60.0000\n","malignant TP: 
7.0\n","malignant TN: 3.0\n","malignant FP: 4.0\n","malignant FN: 2.0\n","\n","\n","\n","Epoch 2/50\n","----------\n","\n","Epoch: 2 train \n","Loss: 0.5416 Acc: 62.3188\n","benign precision: 60.8696 recall: 53.8462\n","benign sensitivity: 53.8462 specificity: 76.3158\n","benign FPR: 23.6842 NPV: 70.7317\n","benign TP: 14.0\n","benign TN: 29.0\n","benign FP: 9.0\n","benign FN: 12.0\n","malignant precision: 70.7317 recall: 76.3158\n","malignant sensitivity: 76.3158 specificity: 53.8462\n","malignant FPR: 46.1538 NPV: 60.8696\n","malignant TP: 29.0\n","malignant TN: 14.0\n","malignant FP: 12.0\n","malignant FN: 9.0\n","\n","\n","\n","Epoch: 2 val \n","Loss: 0.5944 Acc: 75.0000\n","benign precision: 80.0000 recall: 57.1429\n","benign sensitivity: 57.1429 specificity: 88.8889\n","benign FPR: 11.1111 NPV: 72.7273\n","benign TP: 4.0\n","benign TN: 8.0\n","benign FP: 1.0\n","benign FN: 3.0\n","malignant precision: 72.7273 recall: 88.8889\n","malignant sensitivity: 88.8889 specificity: 57.1429\n","malignant FPR: 42.8571 NPV: 80.0000\n","malignant TP: 8.0\n","malignant TN: 4.0\n","malignant FP: 3.0\n","malignant FN: 1.0\n","\n","\n","\n","Epoch 3/50\n","----------\n","\n","Epoch: 3 train \n","Loss: 0.4691 Acc: 71.0145\n","benign precision: 78.9474 recall: 57.6923\n","benign sensitivity: 57.6923 specificity: 89.4737\n","benign FPR: 10.5263 NPV: 75.5556\n","benign TP: 15.0\n","benign TN: 34.0\n","benign FP: 4.0\n","benign FN: 11.0\n","malignant precision: 75.5556 recall: 89.4737\n","malignant sensitivity: 89.4737 specificity: 57.6923\n","malignant FPR: 42.3077 NPV: 78.9474\n","malignant TP: 34.0\n","malignant TN: 15.0\n","malignant FP: 11.0\n","malignant FN: 4.0\n","\n","\n","\n","Epoch: 3 val \n","Loss: 0.5269 Acc: 81.2500\n","benign precision: 100.0000 recall: 57.1429\n","benign sensitivity: 57.1429 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 75.0000\n","benign TP: 4.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 3.0\n","malignant precision: 75.0000 recall: 
100.0000\n","malignant sensitivity: 100.0000 specificity: 57.1429\n","malignant FPR: 42.8571 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 4.0\n","malignant FP: 3.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 4/50\n","----------\n","\n","Epoch: 4 train \n","Loss: 0.3390 Acc: 81.1594\n","benign precision: 88.8889 recall: 82.7586\n","benign sensitivity: 82.7586 specificity: 91.4286\n","benign FPR: 8.5714 NPV: 86.4865\n","benign TP: 24.0\n","benign TN: 32.0\n","benign FP: 3.0\n","benign FN: 5.0\n","malignant precision: 86.4865 recall: 91.4286\n","malignant sensitivity: 91.4286 specificity: 82.7586\n","malignant FPR: 17.2414 NPV: 88.8889\n","malignant TP: 32.0\n","malignant TN: 24.0\n","malignant FP: 5.0\n","malignant FN: 3.0\n","\n","\n","\n","Epoch: 4 val \n","Loss: 0.4431 Acc: 81.2500\n","benign precision: 100.0000 recall: 57.1429\n","benign sensitivity: 57.1429 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 75.0000\n","benign TP: 4.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 3.0\n","malignant precision: 75.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 57.1429\n","malignant FPR: 42.8571 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 4.0\n","malignant FP: 3.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 5/50\n","----------\n","\n","Epoch: 5 train \n","Loss: 0.2642 Acc: 84.0580\n","benign precision: 88.8889 recall: 88.8889\n","benign sensitivity: 88.8889 specificity: 91.8919\n","benign FPR: 8.1081 NPV: 91.8919\n","benign TP: 24.0\n","benign TN: 34.0\n","benign FP: 3.0\n","benign FN: 3.0\n","malignant precision: 91.8919 recall: 91.8919\n","malignant sensitivity: 91.8919 specificity: 88.8889\n","malignant FPR: 11.1111 NPV: 88.8889\n","malignant TP: 34.0\n","malignant TN: 24.0\n","malignant FP: 3.0\n","malignant FN: 3.0\n","\n","\n","\n","Epoch: 5 val \n","Loss: 0.4339 Acc: 81.2500\n","benign precision: 100.0000 recall: 57.1429\n","benign sensitivity: 57.1429 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 
75.0000\n","benign TP: 4.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 3.0\n","malignant precision: 75.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 57.1429\n","malignant FPR: 42.8571 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 4.0\n","malignant FP: 3.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 6/50\n","----------\n","\n","Epoch: 6 train \n","Loss: 0.2274 Acc: 88.4058\n","benign precision: 100.0000 recall: 88.4615\n","benign sensitivity: 88.4615 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 92.6829\n","benign TP: 23.0\n","benign TN: 38.0\n","benign FP: 0.0\n","benign FN: 3.0\n","malignant precision: 92.6829 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 88.4615\n","malignant FPR: 11.5385 NPV: 100.0000\n","malignant TP: 38.0\n","malignant TN: 23.0\n","malignant FP: 3.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 6 val \n","Loss: 0.3520 Acc: 81.2500\n","benign precision: 100.0000 recall: 57.1429\n","benign sensitivity: 57.1429 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 75.0000\n","benign TP: 4.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 3.0\n","malignant precision: 75.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 57.1429\n","malignant FPR: 42.8571 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 4.0\n","malignant FP: 3.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 7/50\n","----------\n","\n","Epoch: 7 train \n","Loss: 0.2065 Acc: 88.4058\n","benign precision: 96.1538 recall: 92.5926\n","benign sensitivity: 92.5926 specificity: 97.2973\n","benign FPR: 2.7027 NPV: 94.7368\n","benign TP: 25.0\n","benign TN: 36.0\n","benign FP: 1.0\n","benign FN: 2.0\n","malignant precision: 94.7368 recall: 97.2973\n","malignant sensitivity: 97.2973 specificity: 92.5926\n","malignant FPR: 7.4074 NPV: 96.1538\n","malignant TP: 36.0\n","malignant TN: 25.0\n","malignant FP: 2.0\n","malignant FN: 1.0\n","\n","\n","\n","Epoch: 7 val \n","Loss: 0.2767 Acc: 
81.2500\n","benign precision: 100.0000 recall: 57.1429\n","benign sensitivity: 57.1429 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 75.0000\n","benign TP: 4.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 3.0\n","malignant precision: 75.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 57.1429\n","malignant FPR: 42.8571 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 4.0\n","malignant FP: 3.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 8/50\n","----------\n","\n","Epoch: 8 train \n","Loss: 0.1894 Acc: 88.4058\n","benign precision: 100.0000 recall: 90.0000\n","benign sensitivity: 90.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 91.8919\n","benign TP: 27.0\n","benign TN: 34.0\n","benign FP: 0.0\n","benign FN: 3.0\n","malignant precision: 91.8919 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 90.0000\n","malignant FPR: 10.0000 NPV: 100.0000\n","malignant TP: 34.0\n","malignant TN: 27.0\n","malignant FP: 3.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 8 val \n","Loss: 0.3026 Acc: 81.2500\n","benign precision: 100.0000 recall: 57.1429\n","benign sensitivity: 57.1429 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 75.0000\n","benign TP: 4.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 3.0\n","malignant precision: 75.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 57.1429\n","malignant FPR: 42.8571 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 4.0\n","malignant FP: 3.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 9/50\n","----------\n","\n","Epoch: 9 train \n","Loss: 0.1697 Acc: 88.4058\n","benign precision: 96.1538 recall: 92.5926\n","benign sensitivity: 92.5926 specificity: 97.2973\n","benign FPR: 2.7027 NPV: 94.7368\n","benign TP: 25.0\n","benign TN: 36.0\n","benign FP: 1.0\n","benign FN: 2.0\n","malignant precision: 94.7368 recall: 97.2973\n","malignant sensitivity: 97.2973 specificity: 92.5926\n","malignant FPR: 7.4074 NPV: 96.1538\n","malignant TP: 
36.0\n","malignant TN: 25.0\n","malignant FP: 2.0\n","malignant FN: 1.0\n","\n","\n","\n","Epoch: 9 val \n","Loss: 0.1931 Acc: 93.7500\n","benign precision: 100.0000 recall: 85.7143\n","benign sensitivity: 85.7143 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 90.0000\n","benign TP: 6.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 1.0\n","malignant precision: 90.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 85.7143\n","malignant FPR: 14.2857 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 6.0\n","malignant FP: 1.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 10/50\n","----------\n","\n","Epoch: 10 train \n","Loss: 0.1084 Acc: 89.8551\n","benign precision: 96.4286 recall: 96.4286\n","benign sensitivity: 96.4286 specificity: 97.2222\n","benign FPR: 2.7778 NPV: 97.2222\n","benign TP: 27.0\n","benign TN: 35.0\n","benign FP: 1.0\n","benign FN: 1.0\n","malignant precision: 97.2222 recall: 97.2222\n","malignant sensitivity: 97.2222 specificity: 96.4286\n","malignant FPR: 3.5714 NPV: 96.4286\n","malignant TP: 35.0\n","malignant TN: 27.0\n","malignant FP: 1.0\n","malignant FN: 1.0\n","\n","\n","\n","Epoch: 10 val \n","Loss: 0.2491 Acc: 87.5000\n","benign precision: 100.0000 recall: 71.4286\n","benign sensitivity: 71.4286 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 81.8182\n","benign TP: 5.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 2.0\n","malignant precision: 81.8182 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 71.4286\n","malignant FPR: 28.5714 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 5.0\n","malignant FP: 2.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 11/50\n","----------\n","\n","Epoch: 11 train \n","Loss: 0.0966 Acc: 91.3043\n","benign precision: 100.0000 recall: 96.2963\n","benign sensitivity: 96.2963 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 97.3684\n","benign TP: 26.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 1.0\n","malignant precision: 97.3684 
recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 96.2963\n","malignant FPR: 3.7037 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 26.0\n","malignant FP: 1.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 11 val \n","Loss: 0.1859 Acc: 93.7500\n","benign precision: 100.0000 recall: 85.7143\n","benign sensitivity: 85.7143 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 90.0000\n","benign TP: 6.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 1.0\n","malignant precision: 90.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 85.7143\n","malignant FPR: 14.2857 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 6.0\n","malignant FP: 1.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 12/50\n","----------\n","\n","Epoch: 12 train \n","Loss: 0.0863 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 29.0\n","benign TN: 35.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 35.0\n","malignant TN: 29.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 12 val \n","Loss: 0.1140 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 13/50\n","----------\n","\n","Epoch: 13 train \n","Loss: 0.0857 Acc: 91.3043\n","benign precision: 100.0000 recall: 96.2963\n","benign sensitivity: 96.2963 specificity: 
100.0000\n","benign FPR: 0.0000 NPV: 97.3684\n","benign TP: 26.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 1.0\n","malignant precision: 97.3684 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 96.2963\n","malignant FPR: 3.7037 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 26.0\n","malignant FP: 1.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 13 val \n","Loss: 0.1308 Acc: 93.7500\n","benign precision: 100.0000 recall: 85.7143\n","benign sensitivity: 85.7143 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 90.0000\n","benign TP: 6.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 1.0\n","malignant precision: 90.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 85.7143\n","malignant FPR: 14.2857 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 6.0\n","malignant FP: 1.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 14/50\n","----------\n","\n","Epoch: 14 train \n","Loss: 0.0749 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 14 val \n","Loss: 0.1053 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 
15/50\n","----------\n","\n","Epoch: 15 train \n","Loss: 0.0503 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 26.0\n","benign TN: 38.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 38.0\n","malignant TN: 26.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 15 val \n","Loss: 0.1226 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 16/50\n","----------\n","\n","Epoch: 16 train \n","Loss: 0.0510 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 16 val \n","Loss: 0.0832 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 
100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 17/50\n","----------\n","\n","Epoch: 17 train \n","Loss: 0.0495 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 17 val \n","Loss: 0.0738 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 18/50\n","----------\n","\n","Epoch: 18 train \n","Loss: 0.0476 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 18 val \n","Loss: 0.0692 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 
7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 19/50\n","----------\n","\n","Epoch: 19 train \n","Loss: 0.0376 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 19 val \n","Loss: 0.0681 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 20/50\n","----------\n","\n","Epoch: 20 train \n","Loss: 0.0319 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 20 val \n","Loss: 0.0694 Acc: 
100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 21/50\n","----------\n","\n","Epoch: 21 train \n","Loss: 0.0264 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 21 val \n","Loss: 0.0643 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 22/50\n","----------\n","\n","Epoch: 22 train \n","Loss: 0.0258 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 
100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 22 val \n","Loss: 0.0568 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 23/50\n","----------\n","\n","Epoch: 23 train \n","Loss: 0.0238 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 29.0\n","benign TN: 35.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 35.0\n","malignant TN: 29.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 23 val \n","Loss: 0.0567 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 24/50\n","----------\n","\n","Epoch: 24 train \n","Loss: 0.0212 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 
0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 24 val \n","Loss: 0.0572 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 25/50\n","----------\n","\n","Epoch: 25 train \n","Loss: 0.0195 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 25 val \n","Loss: 0.0650 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 26/50\n","----------\n","\n","Epoch: 26 train \n","Loss: 0.0256 Acc: 92.7536\n","benign precision: 100.0000 recall: 
100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 30.0\n","benign TN: 34.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 34.0\n","malignant TN: 30.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 26 val \n","Loss: 0.0390 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 27/50\n","----------\n","\n","Epoch: 27 train \n","Loss: 0.0197 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 26.0\n","benign TN: 38.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 38.0\n","malignant TN: 26.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 27 val \n","Loss: 0.0506 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 
0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 28/50\n","----------\n","\n","Epoch: 28 train \n","Loss: 0.0260 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 28 val \n","Loss: 0.0452 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 29/50\n","----------\n","\n","Epoch: 29 train \n","Loss: 0.0164 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 29 val \n","Loss: 0.0368 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 
100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 30/50\n","----------\n","\n","Epoch: 30 train \n","Loss: 0.0166 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 26.0\n","benign TN: 38.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 38.0\n","malignant TN: 26.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 30 val \n","Loss: 0.0475 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 31/50\n","----------\n","\n","Epoch: 31 train \n","Loss: 0.0150 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 26.0\n","benign TN: 38.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 38.0\n","malignant TN: 26.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 31 val \n","Loss: 0.0456 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 
100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 32/50\n","----------\n","\n","Epoch: 32 train \n","Loss: 0.0183 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 29.0\n","benign TN: 35.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 35.0\n","malignant TN: 29.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 32 val \n","Loss: 0.0325 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 33/50\n","----------\n","\n","Epoch: 33 train \n","Loss: 0.0151 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 
0.0\n","\n","\n","\n","Epoch: 33 val \n","Loss: 0.0345 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 34/50\n","----------\n","\n","Epoch: 34 train \n","Loss: 0.0171 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 29.0\n","benign TN: 35.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 35.0\n","malignant TN: 29.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 34 val \n","Loss: 0.0338 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 35/50\n","----------\n","\n","Epoch: 35 train \n","Loss: 0.0119 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 26.0\n","benign TN: 38.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 
100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 38.0\n","malignant TN: 26.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 35 val \n","Loss: 0.0363 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 36/50\n","----------\n","\n","Epoch: 36 train \n","Loss: 0.0127 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 29.0\n","benign TN: 35.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 35.0\n","malignant TN: 29.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 36 val \n","Loss: 0.0330 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 37/50\n","----------\n","\n","Epoch: 37 train \n","Loss: 0.0155 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 
100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 37 val \n","Loss: 0.0273 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 38/50\n","----------\n","\n","Epoch: 38 train \n","Loss: 0.0097 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 38 val \n","Loss: 0.0256 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 39/50\n","----------\n","\n","Epoch: 39 train 
\n","Loss: 0.0122 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 39 val \n","Loss: 0.0268 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 40/50\n","----------\n","\n","Epoch: 40 train \n","Loss: 0.0106 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 40 val \n","Loss: 0.0289 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 
100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 41/50\n","----------\n","\n","Epoch: 41 train \n","Loss: 0.0095 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 41 val \n","Loss: 0.0293 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 42/50\n","----------\n","\n","Epoch: 42 train \n","Loss: 0.0093 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 29.0\n","benign TN: 35.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 35.0\n","malignant TN: 29.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 42 val \n","Loss: 0.0299 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 
0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 43/50\n","----------\n","\n","Epoch: 43 train \n","Loss: 0.0066 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 43 val \n","Loss: 0.0290 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 44/50\n","----------\n","\n","Epoch: 44 train \n","Loss: 0.0105 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 44 val \n","Loss: 0.0268 Acc: 100.0000\n","benign precision: 100.0000 recall: 
100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 45/50\n","----------\n","\n","Epoch: 45 train \n","Loss: 0.0096 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 45 val \n","Loss: 0.0287 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 46/50\n","----------\n","\n","Epoch: 46 train \n","Loss: 0.0093 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant 
TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 46 val \n","Loss: 0.0282 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 47/50\n","----------\n","\n","Epoch: 47 train \n","Loss: 0.0103 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 47 val \n","Loss: 0.0255 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 48/50\n","----------\n","\n","Epoch: 48 train \n","Loss: 0.0100 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 26.0\n","benign TN: 38.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 
100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 38.0\n","malignant TN: 26.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 48 val \n","Loss: 0.0256 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 49/50\n","----------\n","\n","Epoch: 49 train \n","Loss: 0.0074 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 49 val \n","Loss: 0.0256 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 50/50\n","----------\n","\n","Epoch: 50 train \n","Loss: 0.0099 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 
specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 50 val \n","Loss: 0.0294 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Training complete in 0m 46s\n","Best epoch idx: 50\n","Best epoch train Acc: 92.753623\n","Best epoch val Acc: 100.000000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","model trained by GPU (idx:0) has been saved at /home/Pathology_Experiment/saved_models/CLS_ViT_base_timm_PromptDeep_20_401_lf15_finetuning_warwick_CLS.pth\n"]}],"source":["!python Train.py --edge_size 224 --data_augmentation_mode 3 --lr 1e-05 --lrf 0.15 --enable_tensorboard --model_idx ViT_base_timm_PromptDeep_20_401_lf15_finetuning_warwick_CLS --PromptTuning Deep --dataroot /data/Pathology_Experiment/dataset/warwick_CLS --draw_root /home/Pathology_Experiment/runs/VPT_finetuning_with_timm --model_path 
/home/Pathology_Experiment/saved_models"]},{"cell_type":"markdown","metadata":{"id":"q2mnI-UW2YFS"},"source":["Test"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"background_save":true},"id":"98WKJutR2QtY"},"outputs":[{"name":"stdout","output_type":"stream","text":["class_names: ['benign', 'malignant']\n","base_state_dict of timm\n","Test the PromptTuning of ViT_base_timm_PromptDeep_20_401_lf15_finetuning_warwick_CLS\n","Prompt VPT type: Deep\n","in prompt model building, timm ViT loaded for base_state_dict\n","model forward cheacked\n","model is ready now!\n","prompt head match\n","model loaded\n","model : ViT_base_timm_PromptDeep_20_401_lf15_finetuning_warwick_CLS\n","*********************************setting*************************************\n","Namespace(model_idx='ViT_base_timm_PromptDeep_20_401_lf15_finetuning_warwick_CLS', drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0, cls_token_off=False, pos_embedding_off=False, att_module='SimAM', gpu_idx=0, dataroot='/data/Pathology_Experiment/dataset/warwick_CLS', model_path='/home/Pathology_Experiment/saved_models', draw_root='/home/Pathology_Experiment/runs/VPT_finetuning_with_timm', model_path_by_hand=None, paint=True, enable_tensorboard=False, enable_attention_check=False, enable_visualize_check=False, data_augmentation_mode=3, PromptTuning='Deep', Prompt_Token_num=20, PromptUnFreeze=False, Pre_Trained_model_path=None, num_classes=0, edge_size=224, batch_size=1, check_minibatch=None)\n","Epoch: Test\n","----------\n","Epoch: test test index of 4 minibatch: 1 time used: 2.232135534286499\n","minibatch AVG loss: 0.26816506078466773\n","Epoch: test test index of 4 minibatch: 2 time used: 0.05264568328857422\n","minibatch AVG loss: 0.05344134452752769\n","Epoch: test test index of 4 minibatch: 3 time used: 0.05270862579345703\n","minibatch AVG loss: 0.0027116276614833623\n","Epoch: test test index of 4 minibatch: 4 time used: 0.052893638610839844\n","minibatch AVG loss: 
0.4669480860720796\n","Epoch: test test index of 4 minibatch: 5 time used: 0.2150881290435791\n","minibatch AVG loss: 0.6584409326897003\n","Epoch: test test index of 4 minibatch: 6 time used: 0.07256197929382324\n","minibatch AVG loss: 0.14450758622115245\n","Epoch: test test index of 4 minibatch: 7 time used: 0.06723260879516602\n","minibatch AVG loss: 0.20112267805961892\n","Epoch: test test index of 4 minibatch: 8 time used: 0.0691368579864502\n","minibatch AVG loss: 0.014065294861211441\n","Epoch: test test index of 4 minibatch: 9 time used: 0.06417989730834961\n","minibatch AVG loss: 0.01810028008185327\n","Epoch: test test index of 4 minibatch: 10 time used: 0.0646204948425293\n","minibatch AVG loss: 0.10914140564273112\n","Epoch: test test index of 4 minibatch: 11 time used: 0.052018165588378906\n","minibatch AVG loss: 0.025396305602043867\n","Epoch: test test index of 4 minibatch: 12 time used: 0.0520014762878418\n","minibatch AVG loss: 0.13099558325484395\n","Epoch: test test index of 4 minibatch: 13 time used: 0.05219769477844238\n","minibatch AVG loss: 0.010371054049755912\n","Epoch: test test index of 4 minibatch: 14 time used: 0.05182027816772461\n","minibatch AVG loss: 0.010890420817304403\n","Epoch: test test index of 4 minibatch: 15 time used: 0.05169200897216797\n","minibatch AVG loss: 0.005727577343350276\n","Epoch: test test index of 4 minibatch: 16 time used: 0.0532376766204834\n","minibatch AVG loss: 0.001175819052150473\n","Epoch: test test index of 4 minibatch: 17 time used: 0.05185651779174805\n","minibatch AVG loss: 0.015627381930244155\n","Epoch: test test index of 4 minibatch: 18 time used: 0.05151867866516113\n","minibatch AVG loss: 0.011185463983565569\n","Epoch: test test index of 4 minibatch: 19 time used: 0.05146026611328125\n","minibatch AVG loss: 0.0034086240502801957\n","Epoch: test test index of 4 minibatch: 20 time used: 0.05474138259887695\n","minibatch AVG loss: 0.11933412203507032\n","\n","Epoch: test \n","Loss: 0.1135 Acc: 
95.0000\n","benign precision: 100.0000 recall: 89.1892\n","benign sensitivity: 89.1892 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 91.4894\n","benign TP: 33.0\n","benign TN: 43.0\n","benign FP: 0.0\n","benign FN: 4.0\n","malignant precision: 91.4894 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 89.1892\n","malignant FPR: 10.8108 NPV: 100.0000\n","malignant TP: 43.0\n","malignant TN: 33.0\n","malignant FP: 4.0\n","malignant FN: 0.0\n","\n","\n","Testing complete in 0m 4s\n"]}],"source":["!python Test.py --edge_size 224 --data_augmentation_mode 3 --model_idx ViT_base_timm_PromptDeep_20_401_lf15_finetuning_warwick_CLS --PromptTuning Deep --dataroot /data/Pathology_Experiment/dataset/warwick_CLS --draw_root /home/Pathology_Experiment/runs/VPT_finetuning_with_timm --model_path /home/Pathology_Experiment/saved_models"]},{"cell_type":"markdown","metadata":{"id":"P5rRq1HhbSiC"},"source":["## Finetuning with PuzzleTuning Prompt\n","VPT + finetuning (with timm weight \u0026 PuzzleTuning Prompt)"]},{"cell_type":"markdown","metadata":{"id":"viVkMdz5bVQ_"},"source":["Train"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"background_save":true},"id":"UUYpNJ74bV4n"},"outputs":[{"name":"stdout","output_type":"stream","text":["class_names: ['benign', 'malignant']\n","*********************************setting*************************************\n","Namespace(model_idx='ViT_base_timm_PuzzleTuning_SAE_promptstate_PromptDeep_20_401_lf15_finetuning_warwick_CLS', drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0, cls_token_off=False, pos_embedding_off=False, att_module='SimAM', backbone_PT_off=False, gpu_idx=-1, dataroot='/data/Pathology_Experiment/dataset/warwick_CLS', model_path='/home/Pathology_Experiment/saved_models', draw_root='/home/Pathology_Experiment/runs/SAE-timm-start_promptstate_VPT_finetuning_with_timm', paint=True, enable_tensorboard=True, enable_attention_check=False, enable_visualize_check=False, PromptTuning='Deep', 
Prompt_Token_num=20, PromptUnFreeze=True, linearprobing=False, Pre_Trained_model_path=None, Prompt_state_path='/home/Pathology_Experiment/saved_models/ViT_b16_224_timm_PuzzleTuning_SAE_CPIAm_Prompt_Deep_tokennum_20_promptstate.pth', enable_sam=False, augmentation_name=None, ratio_strategy=None, patch_strategy=None, loss_drive_threshold=4.0, fix_position_ratio=0.5, fix_patch_size=None, patch_size_jump=None, num_classes=0, edge_size=224, data_augmentation_mode=3, batch_size=8, num_epochs=50, intake_epochs=0, lr=1e-05, lrf=0.15, opt_name='Adam', check_minibatch=None, num_workers=2)\n","we dont have more GPU idx here, try to use gpu_idx=0\n","PromptTuning of ViT_base_timm_PuzzleTuning_SAE_promptstate_PromptDeep_20_401_lf15_finetuning_warwick_CLS\n","Prompt VPT type: Deep\n","backbone base_state_dict of timm\n","prompting with prompt_state at: /home/Pathology_Experiment/saved_models/ViT_b16_224_timm_PuzzleTuning_SAE_CPIAm_Prompt_Deep_tokennum_20_promptstate.pth\n","in prompt model building, timm ViT loaded for base_state_dict\n","head not match, so skip head\n","in prompt model building, a .pth prompt_state_dict loaded\n","model forward cheacked\n","model is ready now!\n","prompt tuning with all parameaters un-freezed\n","GPU: 0\n","----------------------------------------------------------------\n"," Layer (type) Output Shape Param #\n","================================================================\n"," Conv2d-1 [-1, 768, 14, 14] 590,592\n"," Identity-2 [-1, 196, 768] 0\n"," PatchEmbed-3 [-1, 196, 768] 0\n"," Dropout-4 [-1, 197, 768] 0\n"," LayerNorm-5 [-1, 217, 768] 1,536\n"," Linear-6 [-1, 217, 2304] 1,771,776\n"," Dropout-7 [-1, 12, 217, 217] 0\n"," Linear-8 [-1, 217, 768] 590,592\n"," Dropout-9 [-1, 217, 768] 0\n"," Attention-10 [-1, 217, 768] 0\n"," Identity-11 [-1, 217, 768] 0\n"," LayerNorm-12 [-1, 217, 768] 1,536\n"," Linear-13 [-1, 217, 3072] 2,362,368\n"," GELU-14 [-1, 217, 3072] 0\n"," Dropout-15 [-1, 217, 3072] 0\n"," Linear-16 [-1, 217, 768] 
2,360,064\n"," Dropout-17 [-1, 217, 768] 0\n"," Mlp-18 [-1, 217, 768] 0\n"," Identity-19 [-1, 217, 768] 0\n"," Block-20 [-1, 217, 768] 0\n"," LayerNorm-21 [-1, 217, 768] 1,536\n"," Linear-22 [-1, 217, 2304] 1,771,776\n"," Dropout-23 [-1, 12, 217, 217] 0\n"," Linear-24 [-1, 217, 768] 590,592\n"," Dropout-25 [-1, 217, 768] 0\n"," Attention-26 [-1, 217, 768] 0\n"," Identity-27 [-1, 217, 768] 0\n"," LayerNorm-28 [-1, 217, 768] 1,536\n"," Linear-29 [-1, 217, 3072] 2,362,368\n"," GELU-30 [-1, 217, 3072] 0\n"," Dropout-31 [-1, 217, 3072] 0\n"," Linear-32 [-1, 217, 768] 2,360,064\n"," Dropout-33 [-1, 217, 768] 0\n"," Mlp-34 [-1, 217, 768] 0\n"," Identity-35 [-1, 217, 768] 0\n"," Block-36 [-1, 217, 768] 0\n"," LayerNorm-37 [-1, 217, 768] 1,536\n"," Linear-38 [-1, 217, 2304] 1,771,776\n"," Dropout-39 [-1, 12, 217, 217] 0\n"," Linear-40 [-1, 217, 768] 590,592\n"," Dropout-41 [-1, 217, 768] 0\n"," Attention-42 [-1, 217, 768] 0\n"," Identity-43 [-1, 217, 768] 0\n"," LayerNorm-44 [-1, 217, 768] 1,536\n"," Linear-45 [-1, 217, 3072] 2,362,368\n"," GELU-46 [-1, 217, 3072] 0\n"," Dropout-47 [-1, 217, 3072] 0\n"," Linear-48 [-1, 217, 768] 2,360,064\n"," Dropout-49 [-1, 217, 768] 0\n"," Mlp-50 [-1, 217, 768] 0\n"," Identity-51 [-1, 217, 768] 0\n"," Block-52 [-1, 217, 768] 0\n"," LayerNorm-53 [-1, 217, 768] 1,536\n"," Linear-54 [-1, 217, 2304] 1,771,776\n"," Dropout-55 [-1, 12, 217, 217] 0\n"," Linear-56 [-1, 217, 768] 590,592\n"," Dropout-57 [-1, 217, 768] 0\n"," Attention-58 [-1, 217, 768] 0\n"," Identity-59 [-1, 217, 768] 0\n"," LayerNorm-60 [-1, 217, 768] 1,536\n"," Linear-61 [-1, 217, 3072] 2,362,368\n"," GELU-62 [-1, 217, 3072] 0\n"," Dropout-63 [-1, 217, 3072] 0\n"," Linear-64 [-1, 217, 768] 2,360,064\n"," Dropout-65 [-1, 217, 768] 0\n"," Mlp-66 [-1, 217, 768] 0\n"," Identity-67 [-1, 217, 768] 0\n"," Block-68 [-1, 217, 768] 0\n"," LayerNorm-69 [-1, 217, 768] 1,536\n"," Linear-70 [-1, 217, 2304] 1,771,776\n"," Dropout-71 [-1, 12, 217, 217] 0\n"," Linear-72 [-1, 217, 768] 
590,592\n"," Dropout-73 [-1, 217, 768] 0\n"," Attention-74 [-1, 217, 768] 0\n"," Identity-75 [-1, 217, 768] 0\n"," LayerNorm-76 [-1, 217, 768] 1,536\n"," Linear-77 [-1, 217, 3072] 2,362,368\n"," GELU-78 [-1, 217, 3072] 0\n"," Dropout-79 [-1, 217, 3072] 0\n"," Linear-80 [-1, 217, 768] 2,360,064\n"," Dropout-81 [-1, 217, 768] 0\n"," Mlp-82 [-1, 217, 768] 0\n"," Identity-83 [-1, 217, 768] 0\n"," Block-84 [-1, 217, 768] 0\n"," LayerNorm-85 [-1, 217, 768] 1,536\n"," Linear-86 [-1, 217, 2304] 1,771,776\n"," Dropout-87 [-1, 12, 217, 217] 0\n"," Linear-88 [-1, 217, 768] 590,592\n"," Dropout-89 [-1, 217, 768] 0\n"," Attention-90 [-1, 217, 768] 0\n"," Identity-91 [-1, 217, 768] 0\n"," LayerNorm-92 [-1, 217, 768] 1,536\n"," Linear-93 [-1, 217, 3072] 2,362,368\n"," GELU-94 [-1, 217, 3072] 0\n"," Dropout-95 [-1, 217, 3072] 0\n"," Linear-96 [-1, 217, 768] 2,360,064\n"," Dropout-97 [-1, 217, 768] 0\n"," Mlp-98 [-1, 217, 768] 0\n"," Identity-99 [-1, 217, 768] 0\n"," Block-100 [-1, 217, 768] 0\n"," LayerNorm-101 [-1, 217, 768] 1,536\n"," Linear-102 [-1, 217, 2304] 1,771,776\n"," Dropout-103 [-1, 12, 217, 217] 0\n"," Linear-104 [-1, 217, 768] 590,592\n"," Dropout-105 [-1, 217, 768] 0\n"," Attention-106 [-1, 217, 768] 0\n"," Identity-107 [-1, 217, 768] 0\n"," LayerNorm-108 [-1, 217, 768] 1,536\n"," Linear-109 [-1, 217, 3072] 2,362,368\n"," GELU-110 [-1, 217, 3072] 0\n"," Dropout-111 [-1, 217, 3072] 0\n"," Linear-112 [-1, 217, 768] 2,360,064\n"," Dropout-113 [-1, 217, 768] 0\n"," Mlp-114 [-1, 217, 768] 0\n"," Identity-115 [-1, 217, 768] 0\n"," Block-116 [-1, 217, 768] 0\n"," LayerNorm-117 [-1, 217, 768] 1,536\n"," Linear-118 [-1, 217, 2304] 1,771,776\n"," Dropout-119 [-1, 12, 217, 217] 0\n"," Linear-120 [-1, 217, 768] 590,592\n"," Dropout-121 [-1, 217, 768] 0\n"," Attention-122 [-1, 217, 768] 0\n"," Identity-123 [-1, 217, 768] 0\n"," LayerNorm-124 [-1, 217, 768] 1,536\n"," Linear-125 [-1, 217, 3072] 2,362,368\n"," GELU-126 [-1, 217, 3072] 0\n"," Dropout-127 [-1, 217, 3072] 0\n"," 
Linear-128 [-1, 217, 768] 2,360,064\n"," Dropout-129 [-1, 217, 768] 0\n"," Mlp-130 [-1, 217, 768] 0\n"," Identity-131 [-1, 217, 768] 0\n"," Block-132 [-1, 217, 768] 0\n"," LayerNorm-133 [-1, 217, 768] 1,536\n"," Linear-134 [-1, 217, 2304] 1,771,776\n"," Dropout-135 [-1, 12, 217, 217] 0\n"," Linear-136 [-1, 217, 768] 590,592\n"," Dropout-137 [-1, 217, 768] 0\n"," Attention-138 [-1, 217, 768] 0\n"," Identity-139 [-1, 217, 768] 0\n"," LayerNorm-140 [-1, 217, 768] 1,536\n"," Linear-141 [-1, 217, 3072] 2,362,368\n"," GELU-142 [-1, 217, 3072] 0\n"," Dropout-143 [-1, 217, 3072] 0\n"," Linear-144 [-1, 217, 768] 2,360,064\n"," Dropout-145 [-1, 217, 768] 0\n"," Mlp-146 [-1, 217, 768] 0\n"," Identity-147 [-1, 217, 768] 0\n"," Block-148 [-1, 217, 768] 0\n"," LayerNorm-149 [-1, 217, 768] 1,536\n"," Linear-150 [-1, 217, 2304] 1,771,776\n"," Dropout-151 [-1, 12, 217, 217] 0\n"," Linear-152 [-1, 217, 768] 590,592\n"," Dropout-153 [-1, 217, 768] 0\n"," Attention-154 [-1, 217, 768] 0\n"," Identity-155 [-1, 217, 768] 0\n"," LayerNorm-156 [-1, 217, 768] 1,536\n"," Linear-157 [-1, 217, 3072] 2,362,368\n"," GELU-158 [-1, 217, 3072] 0\n"," Dropout-159 [-1, 217, 3072] 0\n"," Linear-160 [-1, 217, 768] 2,360,064\n"," Dropout-161 [-1, 217, 768] 0\n"," Mlp-162 [-1, 217, 768] 0\n"," Identity-163 [-1, 217, 768] 0\n"," Block-164 [-1, 217, 768] 0\n"," LayerNorm-165 [-1, 217, 768] 1,536\n"," Linear-166 [-1, 217, 2304] 1,771,776\n"," Dropout-167 [-1, 12, 217, 217] 0\n"," Linear-168 [-1, 217, 768] 590,592\n"," Dropout-169 [-1, 217, 768] 0\n"," Attention-170 [-1, 217, 768] 0\n"," Identity-171 [-1, 217, 768] 0\n"," LayerNorm-172 [-1, 217, 768] 1,536\n"," Linear-173 [-1, 217, 3072] 2,362,368\n"," GELU-174 [-1, 217, 3072] 0\n"," Dropout-175 [-1, 217, 3072] 0\n"," Linear-176 [-1, 217, 768] 2,360,064\n"," Dropout-177 [-1, 217, 768] 0\n"," Mlp-178 [-1, 217, 768] 0\n"," Identity-179 [-1, 217, 768] 0\n"," Block-180 [-1, 217, 768] 0\n"," LayerNorm-181 [-1, 217, 768] 1,536\n"," Linear-182 [-1, 217, 2304] 
1,771,776\n"," Dropout-183 [-1, 12, 217, 217] 0\n"," Linear-184 [-1, 217, 768] 590,592\n"," Dropout-185 [-1, 217, 768] 0\n"," Attention-186 [-1, 217, 768] 0\n"," Identity-187 [-1, 217, 768] 0\n"," LayerNorm-188 [-1, 217, 768] 1,536\n"," Linear-189 [-1, 217, 3072] 2,362,368\n"," GELU-190 [-1, 217, 3072] 0\n"," Dropout-191 [-1, 217, 3072] 0\n"," Linear-192 [-1, 217, 768] 2,360,064\n"," Dropout-193 [-1, 217, 768] 0\n"," Mlp-194 [-1, 217, 768] 0\n"," Identity-195 [-1, 217, 768] 0\n"," Block-196 [-1, 217, 768] 0\n"," LayerNorm-197 [-1, 197, 768] 1,536\n"," Identity-198 [-1, 768] 0\n"," Linear-199 [-1, 2] 1,538\n","================================================================\n","Total params: 85,648,130\n","Trainable params: 85,648,130\n","Non-trainable params: 0\n","----------------------------------------------------------------\n","Input size (MB): 0.57\n","Forward/backward pass size (MB): 454.20\n","Params size (MB): 326.72\n","Estimated Total Size (MB): 781.49\n","----------------------------------------------------------------\n","model : ViT_base_timm_PuzzleTuning_SAE_promptstate_PromptDeep_20_401_lf15_finetuning_warwick_CLS\n","no valid counterparts augmentation selected\n","Epoch 1/50\n","----------\n","\n","Epoch: 1 train \n","Loss: 0.9057 Acc: 47.8261\n","benign precision: 47.5000 recall: 65.5172\n","benign sensitivity: 65.5172 specificity: 40.0000\n","benign FPR: 60.0000 NPV: 58.3333\n","benign TP: 19.0\n","benign TN: 14.0\n","benign FP: 21.0\n","benign FN: 10.0\n","malignant precision: 58.3333 recall: 40.0000\n","malignant sensitivity: 40.0000 specificity: 65.5172\n","malignant FPR: 34.4828 NPV: 47.5000\n","malignant TP: 14.0\n","malignant TN: 19.0\n","malignant FP: 10.0\n","malignant FN: 21.0\n","\n","\n","\n","Epoch: 1 val \n","Loss: 0.6906 Acc: 56.2500\n","benign precision: 0.0000 recall: 0.0000\n","benign sensitivity: 0.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 56.2500\n","benign TP: 0.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign 
FN: 7.0\n","malignant precision: 56.2500 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 0.0000\n","malignant FPR: 100.0000 NPV: 0.0000\n","malignant TP: 9.0\n","malignant TN: 0.0\n","malignant FP: 7.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 2/50\n","----------\n","\n","Epoch: 2 train \n","Loss: 0.7041 Acc: 55.0725\n","benign precision: 0.0000 recall: 0.0000\n","benign sensitivity: 0.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 59.3750\n","benign TP: 0.0\n","benign TN: 38.0\n","benign FP: 0.0\n","benign FN: 26.0\n","malignant precision: 59.3750 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 0.0000\n","malignant FPR: 100.0000 NPV: 0.0000\n","malignant TP: 38.0\n","malignant TN: 0.0\n","malignant FP: 26.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 2 val \n","Loss: 0.6821 Acc: 56.2500\n","benign precision: 0.0000 recall: 0.0000\n","benign sensitivity: 0.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 56.2500\n","benign TP: 0.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 7.0\n","malignant precision: 56.2500 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 0.0000\n","malignant FPR: 100.0000 NPV: 0.0000\n","malignant TP: 9.0\n","malignant TN: 0.0\n","malignant FP: 7.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 3/50\n","----------\n","\n","Epoch: 3 train \n","Loss: 0.6266 Acc: 55.0725\n","benign precision: 0.0000 recall: 0.0000\n","benign sensitivity: 0.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 59.3750\n","benign TP: 0.0\n","benign TN: 38.0\n","benign FP: 0.0\n","benign FN: 26.0\n","malignant precision: 59.3750 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 0.0000\n","malignant FPR: 100.0000 NPV: 0.0000\n","malignant TP: 38.0\n","malignant TN: 0.0\n","malignant FP: 26.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 3 val \n","Loss: 0.6837 Acc: 56.2500\n","benign precision: 0.0000 recall: 0.0000\n","benign sensitivity: 0.0000 specificity: 
100.0000\n","benign FPR: 0.0000 NPV: 56.2500\n","benign TP: 0.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 7.0\n","malignant precision: 56.2500 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 0.0000\n","malignant FPR: 100.0000 NPV: 0.0000\n","malignant TP: 9.0\n","malignant TN: 0.0\n","malignant FP: 7.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 4/50\n","----------\n","\n","Epoch: 4 train \n","Loss: 0.6440 Acc: 50.7246\n","benign precision: 0.0000 recall: 0.0000\n","benign sensitivity: 0.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 54.6875\n","benign TP: 0.0\n","benign TN: 35.0\n","benign FP: 0.0\n","benign FN: 29.0\n","malignant precision: 54.6875 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 0.0000\n","malignant FPR: 100.0000 NPV: 0.0000\n","malignant TP: 35.0\n","malignant TN: 0.0\n","malignant FP: 29.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 4 val \n","Loss: 0.6855 Acc: 56.2500\n","benign precision: 0.0000 recall: 0.0000\n","benign sensitivity: 0.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 56.2500\n","benign TP: 0.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 7.0\n","malignant precision: 56.2500 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 0.0000\n","malignant FPR: 100.0000 NPV: 0.0000\n","malignant TP: 9.0\n","malignant TN: 0.0\n","malignant FP: 7.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 5/50\n","----------\n","\n","Epoch: 5 train \n","Loss: 0.6487 Acc: 44.9275\n","benign precision: 36.3636 recall: 29.6296\n","benign sensitivity: 29.6296 specificity: 62.1622\n","benign FPR: 37.8378 NPV: 54.7619\n","benign TP: 8.0\n","benign TN: 23.0\n","benign FP: 14.0\n","benign FN: 19.0\n","malignant precision: 54.7619 recall: 62.1622\n","malignant sensitivity: 62.1622 specificity: 29.6296\n","malignant FPR: 70.3704 NPV: 36.3636\n","malignant TP: 23.0\n","malignant TN: 8.0\n","malignant FP: 19.0\n","malignant FN: 14.0\n","\n","\n","\n","Epoch: 5 val 
\n","Loss: 0.7125 Acc: 56.2500\n","benign precision: 0.0000 recall: 0.0000\n","benign sensitivity: 0.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 56.2500\n","benign TP: 0.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 7.0\n","malignant precision: 56.2500 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 0.0000\n","malignant FPR: 100.0000 NPV: 0.0000\n","malignant TP: 9.0\n","malignant TN: 0.0\n","malignant FP: 7.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 6/50\n","----------\n","\n","Epoch: 6 train \n","Loss: 0.6396 Acc: 55.0725\n","benign precision: 0.0000 recall: 0.0000\n","benign sensitivity: 0.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 59.3750\n","benign TP: 0.0\n","benign TN: 38.0\n","benign FP: 0.0\n","benign FN: 26.0\n","malignant precision: 59.3750 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 0.0000\n","malignant FPR: 100.0000 NPV: 0.0000\n","malignant TP: 38.0\n","malignant TN: 0.0\n","malignant FP: 26.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 6 val \n","Loss: 0.6673 Acc: 56.2500\n","benign precision: 0.0000 recall: 0.0000\n","benign sensitivity: 0.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 56.2500\n","benign TP: 0.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 7.0\n","malignant precision: 56.2500 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 0.0000\n","malignant FPR: 100.0000 NPV: 0.0000\n","malignant TP: 9.0\n","malignant TN: 0.0\n","malignant FP: 7.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 7/50\n","----------\n","\n","Epoch: 7 train \n","Loss: 0.6158 Acc: 49.2754\n","benign precision: 40.0000 recall: 22.2222\n","benign sensitivity: 22.2222 specificity: 75.6757\n","benign FPR: 24.3243 NPV: 57.1429\n","benign TP: 6.0\n","benign TN: 28.0\n","benign FP: 9.0\n","benign FN: 21.0\n","malignant precision: 57.1429 recall: 75.6757\n","malignant sensitivity: 75.6757 specificity: 22.2222\n","malignant FPR: 77.7778 NPV: 40.0000\n","malignant 
TP: 28.0\n","malignant TN: 6.0\n","malignant FP: 21.0\n","malignant FN: 9.0\n","\n","\n","\n","Epoch: 7 val \n","Loss: 0.6687 Acc: 75.0000\n","benign precision: 66.6667 recall: 85.7143\n","benign sensitivity: 85.7143 specificity: 66.6667\n","benign FPR: 33.3333 NPV: 85.7143\n","benign TP: 6.0\n","benign TN: 6.0\n","benign FP: 3.0\n","benign FN: 1.0\n","malignant precision: 85.7143 recall: 66.6667\n","malignant sensitivity: 66.6667 specificity: 85.7143\n","malignant FPR: 14.2857 NPV: 66.6667\n","malignant TP: 6.0\n","malignant TN: 6.0\n","malignant FP: 1.0\n","malignant FN: 3.0\n","\n","\n","\n","Epoch 8/50\n","----------\n","\n","Epoch: 8 train \n","Loss: 0.5457 Acc: 69.5652\n","benign precision: 81.8182 recall: 60.0000\n","benign sensitivity: 60.0000 specificity: 88.2353\n","benign FPR: 11.7647 NPV: 71.4286\n","benign TP: 18.0\n","benign TN: 30.0\n","benign FP: 4.0\n","benign FN: 12.0\n","malignant precision: 71.4286 recall: 88.2353\n","malignant sensitivity: 88.2353 specificity: 60.0000\n","malignant FPR: 40.0000 NPV: 81.8182\n","malignant TP: 30.0\n","malignant TN: 18.0\n","malignant FP: 12.0\n","malignant FN: 4.0\n","\n","\n","\n","Epoch: 8 val \n","Loss: 0.5382 Acc: 81.2500\n","benign precision: 70.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 66.6667\n","benign FPR: 33.3333 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 6.0\n","benign FP: 3.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 66.6667\n","malignant sensitivity: 66.6667 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 70.0000\n","malignant TP: 6.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 3.0\n","\n","\n","\n","Epoch 9/50\n","----------\n","\n","Epoch: 9 train \n","Loss: 0.5411 Acc: 59.4203\n","benign precision: 55.5556 recall: 74.0741\n","benign sensitivity: 74.0741 specificity: 56.7568\n","benign FPR: 43.2432 NPV: 75.0000\n","benign TP: 20.0\n","benign TN: 21.0\n","benign FP: 16.0\n","benign FN: 7.0\n","malignant precision: 75.0000 
recall: 56.7568\n","malignant sensitivity: 56.7568 specificity: 74.0741\n","malignant FPR: 25.9259 NPV: 55.5556\n","malignant TP: 21.0\n","malignant TN: 20.0\n","malignant FP: 7.0\n","malignant FN: 16.0\n","\n","\n","\n","Epoch: 9 val \n","Loss: 0.7515 Acc: 37.5000\n","benign precision: 38.4615 recall: 71.4286\n","benign sensitivity: 71.4286 specificity: 11.1111\n","benign FPR: 88.8889 NPV: 33.3333\n","benign TP: 5.0\n","benign TN: 1.0\n","benign FP: 8.0\n","benign FN: 2.0\n","malignant precision: 33.3333 recall: 11.1111\n","malignant sensitivity: 11.1111 specificity: 71.4286\n","malignant FPR: 28.5714 NPV: 38.4615\n","malignant TP: 1.0\n","malignant TN: 5.0\n","malignant FP: 2.0\n","malignant FN: 8.0\n","\n","\n","\n","Epoch 10/50\n","----------\n","\n","Epoch: 10 train \n","Loss: 0.5343 Acc: 59.4203\n","benign precision: 56.0976 recall: 82.1429\n","benign sensitivity: 82.1429 specificity: 50.0000\n","benign FPR: 50.0000 NPV: 78.2609\n","benign TP: 23.0\n","benign TN: 18.0\n","benign FP: 18.0\n","benign FN: 5.0\n","malignant precision: 78.2609 recall: 50.0000\n","malignant sensitivity: 50.0000 specificity: 82.1429\n","malignant FPR: 17.8571 NPV: 56.0976\n","malignant TP: 18.0\n","malignant TN: 23.0\n","malignant FP: 5.0\n","malignant FN: 18.0\n","\n","\n","\n","Epoch: 10 val \n","Loss: 0.6291 Acc: 56.2500\n","benign precision: 0.0000 recall: 0.0000\n","benign sensitivity: 0.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 56.2500\n","benign TP: 0.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 7.0\n","malignant precision: 56.2500 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 0.0000\n","malignant FPR: 100.0000 NPV: 0.0000\n","malignant TP: 9.0\n","malignant TN: 0.0\n","malignant FP: 7.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 11/50\n","----------\n","\n","Epoch: 11 train \n","Loss: 0.3603 Acc: 68.1159\n","benign precision: 91.6667 recall: 40.7407\n","benign sensitivity: 40.7407 specificity: 97.2973\n","benign FPR: 2.7027 NPV: 
69.2308\n","benign TP: 11.0\n","benign TN: 36.0\n","benign FP: 1.0\n","benign FN: 16.0\n","malignant precision: 69.2308 recall: 97.2973\n","malignant sensitivity: 97.2973 specificity: 40.7407\n","malignant FPR: 59.2593 NPV: 91.6667\n","malignant TP: 36.0\n","malignant TN: 11.0\n","malignant FP: 16.0\n","malignant FN: 1.0\n","\n","\n","\n","Epoch: 11 val \n","Loss: 0.1934 Acc: 93.7500\n","benign precision: 100.0000 recall: 85.7143\n","benign sensitivity: 85.7143 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 90.0000\n","benign TP: 6.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 1.0\n","malignant precision: 90.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 85.7143\n","malignant FPR: 14.2857 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 6.0\n","malignant FP: 1.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 12/50\n","----------\n","\n","Epoch: 12 train \n","Loss: 0.1909 Acc: 85.5072\n","benign precision: 90.0000 recall: 93.1034\n","benign sensitivity: 93.1034 specificity: 91.4286\n","benign FPR: 8.5714 NPV: 94.1176\n","benign TP: 27.0\n","benign TN: 32.0\n","benign FP: 3.0\n","benign FN: 2.0\n","malignant precision: 94.1176 recall: 91.4286\n","malignant sensitivity: 91.4286 specificity: 93.1034\n","malignant FPR: 6.8966 NPV: 90.0000\n","malignant TP: 32.0\n","malignant TN: 27.0\n","malignant FP: 2.0\n","malignant FN: 3.0\n","\n","\n","\n","Epoch: 12 val \n","Loss: 0.1050 Acc: 93.7500\n","benign precision: 100.0000 recall: 85.7143\n","benign sensitivity: 85.7143 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 90.0000\n","benign TP: 6.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 1.0\n","malignant precision: 90.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 85.7143\n","malignant FPR: 14.2857 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 6.0\n","malignant FP: 1.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 13/50\n","----------\n","\n","Epoch: 13 train \n","Loss: 0.0564 Acc: 
89.8551\n","benign precision: 100.0000 recall: 92.5926\n","benign sensitivity: 92.5926 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 94.8718\n","benign TP: 25.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 2.0\n","malignant precision: 94.8718 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 92.5926\n","malignant FPR: 7.4074 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 25.0\n","malignant FP: 2.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 13 val \n","Loss: 0.1332 Acc: 93.7500\n","benign precision: 87.5000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 88.8889\n","benign FPR: 11.1111 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 8.0\n","benign FP: 1.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 88.8889\n","malignant sensitivity: 88.8889 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 87.5000\n","malignant TP: 8.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 1.0\n","\n","\n","\n","Epoch 14/50\n","----------\n","\n","Epoch: 14 train \n","Loss: 0.0390 Acc: 91.3043\n","benign precision: 96.4286 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 97.2973\n","benign FPR: 2.7027 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 36.0\n","benign FP: 1.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 97.2973\n","malignant sensitivity: 97.2973 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 96.4286\n","malignant TP: 36.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 1.0\n","\n","\n","\n","Epoch: 14 val \n","Loss: 0.0150 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 
7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 15/50\n","----------\n","\n","Epoch: 15 train \n","Loss: 0.0162 Acc: 91.3043\n","benign precision: 100.0000 recall: 96.1538\n","benign sensitivity: 96.1538 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 97.4359\n","benign TP: 25.0\n","benign TN: 38.0\n","benign FP: 0.0\n","benign FN: 1.0\n","malignant precision: 97.4359 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 96.1538\n","malignant FPR: 3.8462 NPV: 100.0000\n","malignant TP: 38.0\n","malignant TN: 25.0\n","malignant FP: 1.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 15 val \n","Loss: 0.0762 Acc: 93.7500\n","benign precision: 87.5000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 88.8889\n","benign FPR: 11.1111 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 8.0\n","benign FP: 1.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 88.8889\n","malignant sensitivity: 88.8889 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 87.5000\n","malignant TP: 8.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 1.0\n","\n","\n","\n","Epoch 16/50\n","----------\n","\n","Epoch: 16 train \n","Loss: 0.0214 Acc: 91.3043\n","benign precision: 96.5517 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 97.2222\n","benign FPR: 2.7778 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 35.0\n","benign FP: 1.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 97.2222\n","malignant sensitivity: 97.2222 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 96.5517\n","malignant TP: 35.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 1.0\n","\n","\n","\n","Epoch: 16 val \n","Loss: 0.0315 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 
100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 17/50\n","----------\n","\n","Epoch: 17 train \n","Loss: 0.0052 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 17 val \n","Loss: 0.1970 Acc: 93.7500\n","benign precision: 100.0000 recall: 85.7143\n","benign sensitivity: 85.7143 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 90.0000\n","benign TP: 6.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 1.0\n","malignant precision: 90.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 85.7143\n","malignant FPR: 14.2857 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 6.0\n","malignant FP: 1.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 18/50\n","----------\n","\n","Epoch: 18 train \n","Loss: 0.0092 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 18 val \n","Loss: 0.3976 Acc: 87.5000\n","benign precision: 77.7778 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 
77.7778\n","benign FPR: 22.2222 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 7.0\n","benign FP: 2.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 77.7778\n","malignant sensitivity: 77.7778 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 77.7778\n","malignant TP: 7.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 2.0\n","\n","\n","\n","Epoch 19/50\n","----------\n","\n","Epoch: 19 train \n","Loss: 0.0221 Acc: 91.3043\n","benign precision: 96.4286 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 97.2973\n","benign FPR: 2.7027 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 36.0\n","benign FP: 1.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 97.2973\n","malignant sensitivity: 97.2973 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 96.4286\n","malignant TP: 36.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 1.0\n","\n","\n","\n","Epoch: 19 val \n","Loss: 0.0045 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 20/50\n","----------\n","\n","Epoch: 20 train \n","Loss: 0.0001 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 
0.0\n","\n","\n","\n","Epoch: 20 val \n","Loss: 0.0233 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 21/50\n","----------\n","\n","Epoch: 21 train \n","Loss: 0.0004 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 21 val \n","Loss: 0.0371 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 22/50\n","----------\n","\n","Epoch: 22 train \n","Loss: 0.0001 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 
100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 22 val \n","Loss: 0.0309 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 23/50\n","----------\n","\n","Epoch: 23 train \n","Loss: 0.0001 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 29.0\n","benign TN: 35.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 35.0\n","malignant TN: 29.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 23 val \n","Loss: 0.0265 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 24/50\n","----------\n","\n","Epoch: 24 train \n","Loss: 0.0000 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 
100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 24 val \n","Loss: 0.0246 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 25/50\n","----------\n","\n","Epoch: 25 train \n","Loss: 0.0001 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 25 val \n","Loss: 0.0212 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 26/50\n","----------\n","\n","Epoch: 26 train 
\n","Loss: 0.0000 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 30.0\n","benign TN: 34.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 34.0\n","malignant TN: 30.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 26 val \n","Loss: 0.0198 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 27/50\n","----------\n","\n","Epoch: 27 train \n","Loss: 0.0000 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 26.0\n","benign TN: 38.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 38.0\n","malignant TN: 26.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 27 val \n","Loss: 0.0195 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 
100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 28/50\n","----------\n","\n","Epoch: 28 train \n","Loss: 0.0001 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 28 val \n","Loss: 0.0182 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 29/50\n","----------\n","\n","Epoch: 29 train \n","Loss: 0.0001 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 29 val \n","Loss: 0.0163 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 
0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 30/50\n","----------\n","\n","Epoch: 30 train \n","Loss: 0.0000 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 26.0\n","benign TN: 38.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 38.0\n","malignant TN: 26.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 30 val \n","Loss: 0.0156 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 31/50\n","----------\n","\n","Epoch: 31 train \n","Loss: 0.0000 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 26.0\n","benign TN: 38.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 38.0\n","malignant TN: 26.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 31 val \n","Loss: 0.0153 Acc: 100.0000\n","benign precision: 100.0000 recall: 
100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 32/50\n","----------\n","\n","Epoch: 32 train \n","Loss: 0.0001 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 29.0\n","benign TN: 35.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 35.0\n","malignant TN: 29.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 32 val \n","Loss: 0.0150 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 33/50\n","----------\n","\n","Epoch: 33 train \n","Loss: 0.0001 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant 
TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 33 val \n","Loss: 0.0145 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 34/50\n","----------\n","\n","Epoch: 34 train \n","Loss: 0.0001 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 29.0\n","benign TN: 35.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 35.0\n","malignant TN: 29.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 34 val \n","Loss: 0.0141 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 35/50\n","----------\n","\n","Epoch: 35 train \n","Loss: 0.0000 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 26.0\n","benign TN: 38.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 
100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 38.0\n","malignant TN: 26.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 35 val \n","Loss: 0.0140 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 36/50\n","----------\n","\n","Epoch: 36 train \n","Loss: 0.0001 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 29.0\n","benign TN: 35.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 35.0\n","malignant TN: 29.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 36 val \n","Loss: 0.0133 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 37/50\n","----------\n","\n","Epoch: 37 train \n","Loss: 0.0001 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 
specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 37 val \n","Loss: 0.0127 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 38/50\n","----------\n","\n","Epoch: 38 train \n","Loss: 0.0001 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 38 val \n","Loss: 0.0123 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 
39/50\n","----------\n","\n","Epoch: 39 train \n","Loss: 0.0000 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 39 val \n","Loss: 0.0121 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 40/50\n","----------\n","\n","Epoch: 40 train \n","Loss: 0.0001 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 40 val \n","Loss: 0.0119 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 
100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 41/50\n","----------\n","\n","Epoch: 41 train \n","Loss: 0.0000 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 41 val \n","Loss: 0.0119 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 42/50\n","----------\n","\n","Epoch: 42 train \n","Loss: 0.0001 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 29.0\n","benign TN: 35.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 35.0\n","malignant TN: 29.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 42 val \n","Loss: 0.0117 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 
7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 43/50\n","----------\n","\n","Epoch: 43 train \n","Loss: 0.0000 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 43 val \n","Loss: 0.0117 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 44/50\n","----------\n","\n","Epoch: 44 train \n","Loss: 0.0000 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 44 val \n","Loss: 0.0117 Acc: 
100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 45/50\n","----------\n","\n","Epoch: 45 train \n","Loss: 0.0000 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 45 val \n","Loss: 0.0118 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 46/50\n","----------\n","\n","Epoch: 46 train \n","Loss: 0.0000 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 
100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 46 val \n","Loss: 0.0119 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 47/50\n","----------\n","\n","Epoch: 47 train \n","Loss: 0.0001 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 47 val \n","Loss: 0.0119 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 48/50\n","----------\n","\n","Epoch: 48 train \n","Loss: 0.0000 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 26.0\n","benign TN: 38.0\n","benign FP: 
0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 38.0\n","malignant TN: 26.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 48 val \n","Loss: 0.0120 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 49/50\n","----------\n","\n","Epoch: 49 train \n","Loss: 0.0002 Acc: 92.7536\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 28.0\n","benign TN: 36.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 36.0\n","malignant TN: 28.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 49 val \n","Loss: 0.0111 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch 50/50\n","----------\n","\n","Epoch: 50 train \n","Loss: 0.0000 Acc: 92.7536\n","benign precision: 100.0000 recall: 
100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 27.0\n","benign TN: 37.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 37.0\n","malignant TN: 27.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Epoch: 50 val \n","Loss: 0.0108 Acc: 100.0000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","benign TP: 7.0\n","benign TN: 9.0\n","benign FP: 0.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","malignant TP: 9.0\n","malignant TN: 7.0\n","malignant FP: 0.0\n","malignant FN: 0.0\n","\n","\n","\n","Training complete in 0m 56s\n","Best epoch idx: 50\n","Best epoch train Acc: 92.753623\n","Best epoch val Acc: 100.000000\n","benign precision: 100.0000 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 100.0000\n","benign FPR: 0.0000 NPV: 100.0000\n","malignant precision: 100.0000 recall: 100.0000\n","malignant sensitivity: 100.0000 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 100.0000\n","model trained by GPU (idx:0) has been saved at /home/Pathology_Experiment/saved_models/CLS_ViT_base_timm_PuzzleTuning_SAE_promptstate_PromptDeep_20_401_lf15_finetuning_warwick_CLS.pth\n"]}],"source":["!python Train.py --edge_size 224 --data_augmentation_mode 3 --lr 1e-05 --lrf 0.15 --enable_tensorboard --model_idx ViT_base_timm_PuzzleTuning_SAE_promptstate_PromptDeep_20_401_lf15_finetuning_warwick_CLS --PromptTuning Deep --Prompt_Token_num 20 --PromptUnFreeze --dataroot /data/Pathology_Experiment/dataset/warwick_CLS --draw_root /home/Pathology_Experiment/runs/SAE-timm-start_promptstate_VPT_finetuning_with_timm 
--Prompt_state_path /home/Pathology_Experiment/saved_models/ViT_b16_224_timm_PuzzleTuning_SAE_CPIAm_Prompt_Deep_tokennum_20_promptstate.pth --model_path /home/Pathology_Experiment/saved_models"]},{"cell_type":"markdown","metadata":{"id":"tjdLl1IgbWJA"},"source":["Test"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"background_save":true},"id":"JWODC-3dbWy5"},"outputs":[{"name":"stdout","output_type":"stream","text":["class_names: ['benign', 'malignant']\n","base_state_dict of timm\n","Test the PromptTuning of ViT_base_timm_PuzzleTuning_SAE_promptstate_PromptDeep_20_401_lf15_finetuning_warwick_CLS\n","Prompt VPT type: Deep\n","in prompt model building, timm ViT loaded for base_state_dict\n","model forward cheacked\n","model is ready now!\n","model loaded\n","model : ViT_base_timm_PuzzleTuning_SAE_promptstate_PromptDeep_20_401_lf15_finetuning_warwick_CLS\n","*********************************setting*************************************\n","Namespace(model_idx='ViT_base_timm_PuzzleTuning_SAE_promptstate_PromptDeep_20_401_lf15_finetuning_warwick_CLS', drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0, cls_token_off=False, pos_embedding_off=False, att_module='SimAM', gpu_idx=0, dataroot='/data/Pathology_Experiment/dataset/warwick_CLS', model_path='/home/Pathology_Experiment/saved_models', draw_root='/home/Pathology_Experiment/runs/SAE-timm-start_promptstate_VPT_finetuning_with_timm', model_path_by_hand=None, paint=True, enable_tensorboard=False, enable_attention_check=False, enable_visualize_check=False, data_augmentation_mode=3, PromptTuning='Deep', Prompt_Token_num=20, PromptUnFreeze=True, Pre_Trained_model_path=None, num_classes=0, edge_size=224, batch_size=1, check_minibatch=None)\n","Epoch: Test\n","----------\n","Epoch: test test index of 4 minibatch: 1 time used: 2.460519790649414\n","minibatch AVG loss: 0.0005482856868184172\n","Epoch: test test index of 4 minibatch: 2 time used: 0.05422377586364746\n","minibatch AVG loss: 
0.028580010335645056\n","Epoch: test test index of 4 minibatch: 3 time used: 0.05495309829711914\n","minibatch AVG loss: 0.00031065010671227355\n","Epoch: test test index of 4 minibatch: 4 time used: 0.05390286445617676\n","minibatch AVG loss: 0.0004322398812064421\n","Epoch: test test index of 4 minibatch: 5 time used: 0.05517458915710449\n","minibatch AVG loss: 0.12173168856133998\n","Epoch: test test index of 4 minibatch: 6 time used: 0.056728363037109375\n","minibatch AVG loss: 0.0001147254761235672\n","Epoch: test test index of 4 minibatch: 7 time used: 0.055054664611816406\n","minibatch AVG loss: 0.00036354226699586434\n","Epoch: test test index of 4 minibatch: 8 time used: 0.05412697792053223\n","minibatch AVG loss: 0.00014341383098326332\n","Epoch: test test index of 4 minibatch: 9 time used: 0.05386781692504883\n","minibatch AVG loss: 0.006190998042256979\n","Epoch: test test index of 4 minibatch: 10 time used: 0.05511641502380371\n","minibatch AVG loss: 0.003244622026613797\n","Epoch: test test index of 4 minibatch: 11 time used: 0.055419206619262695\n","minibatch AVG loss: 0.04551573219941929\n","Epoch: test test index of 4 minibatch: 12 time used: 0.0545496940612793\n","minibatch AVG loss: 0.07759279117362894\n","Epoch: test test index of 4 minibatch: 13 time used: 0.05438709259033203\n","minibatch AVG loss: 1.6769601333962783\n","Epoch: test test index of 4 minibatch: 14 time used: 0.0557103157043457\n","minibatch AVG loss: 0.021024980960646644\n","Epoch: test test index of 4 minibatch: 15 time used: 0.055322885513305664\n","minibatch AVG loss: 0.003789418142332579\n","Epoch: test test index of 4 minibatch: 16 time used: 0.05902862548828125\n","minibatch AVG loss: 0.0001979180588023155\n","Epoch: test test index of 4 minibatch: 17 time used: 0.057543039321899414\n","minibatch AVG loss: 0.003576442887720077\n","Epoch: test test index of 4 minibatch: 18 time used: 0.05739569664001465\n","minibatch AVG loss: 0.0008242135859291011\n","Epoch: test test 
index of 4 minibatch: 19 time used: 0.05773782730102539\n","minibatch AVG loss: 0.0002545509980791394\n","Epoch: test test index of 4 minibatch: 20 time used: 0.05846238136291504\n","minibatch AVG loss: 0.0012478914995881496\n","\n","Epoch: test \n","Loss: 0.0996 Acc: 98.7500\n","benign precision: 97.3684 recall: 100.0000\n","benign sensitivity: 100.0000 specificity: 97.6744\n","benign FPR: 2.3256 NPV: 100.0000\n","benign TP: 37.0\n","benign TN: 42.0\n","benign FP: 1.0\n","benign FN: 0.0\n","malignant precision: 100.0000 recall: 97.6744\n","malignant sensitivity: 97.6744 specificity: 100.0000\n","malignant FPR: 0.0000 NPV: 97.3684\n","malignant TP: 42.0\n","malignant TN: 37.0\n","malignant FP: 0.0\n","malignant FN: 1.0\n","\n","\n","Testing complete in 0m 4s\n"]}],"source":["!python Test.py --edge_size 224 --data_augmentation_mode 3 --model_idx ViT_base_timm_PuzzleTuning_SAE_promptstate_PromptDeep_20_401_lf15_finetuning_warwick_CLS --PromptTuning Deep --Prompt_Token_num 20 --PromptUnFreeze --dataroot /data/Pathology_Experiment/dataset/warwick_CLS --draw_root /home/Pathology_Experiment/runs/SAE-timm-start_promptstate_VPT_finetuning_with_timm --model_path /home/Pathology_Experiment/saved_models"]},{"cell_type":"markdown","metadata":{"id":"XX6Vjy9ec2b2"},"source":["# check the Tensorboard output"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"background_save":true},"id":"MWtYyRI1ff0q"},"outputs":[{"data":{"application/javascript":["\n"," (async () =\u003e {\n"," const url = new URL(await google.colab.kernel.proxyPort(6006, {'cache': true}));\n"," url.searchParams.set('tensorboardColab', 'true');\n"," const iframe = document.createElement('iframe');\n"," iframe.src = url;\n"," iframe.setAttribute('width', '100%');\n"," iframe.setAttribute('height', '800');\n"," iframe.setAttribute('frameborder', 0);\n"," document.body.appendChild(iframe);\n"," })();\n"," "],"text/plain":["\u003cIPython.core.display.Javascript 
object\u003e"]},"metadata":{},"output_type":"display_data"}],"source":["%load_ext tensorboard\n","%tensorboard --logdir '/home/Pathology_Experiment/runs'"]},{"cell_type":"markdown","metadata":{"id":"XSGgbUQ3E0H5"},"source":["# After the task, save the output to google drive\n"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"background_save":true},"id":"ZQlei_AVLknG"},"outputs":[{"name":"stdout","output_type":"stream","text":["usage: check_log_json.py [-h] [--ONE_LOG] [--draw_root DRAW_ROOT] [--record_dir RECORD_DIR]\n","check_log_json.py: error: unrecognized arguments: --enable_notify\n"]}],"source":["# change working dir\n","import os\n","os.chdir(\"/home/Pathology_Experiment/code/utils\")\n","!python check_log_json.py --enable_notify --draw_root /home/Pathology_Experiment/runs --record_dir /home/Pathology_Experiment/CSV_logs"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"background_save":true},"id":"_Wx0ymiiEuyS"},"outputs":[{"name":"stdout","output_type":"stream","text":["/bin/cp: target '/content/drive/MyDrive/Pathology_Experiment/runs/' is not a directory\n","runs copy completed!\n","/bin/cp: target '/content/drive/MyDrive/Pathology_Experiment/saved_models/' is not a directory\n","models copy completed!\n","/bin/cp: target '/content/drive/MyDrive/Pathology_Experiment/imaging_results/' is not a directory\n","imaging_results copy completed!\n"]}],"source":["# copy tensorboard runs\n","!/bin/cp -rf /home/Pathology_Experiment/runs/* /content/drive/MyDrive/Pathology_Experiment/runs/\n","print('runs copy completed!')\n","# copy the traind models\n","!/bin/cp -rf /home/Pathology_Experiment/saved_models/* /content/drive/MyDrive/Pathology_Experiment/saved_models/\n","print('models copy completed!')\n","# copy the imaging_results\n","!/bin/cp -rf /home/Pathology_Experiment/imaging_results/* /content/drive/MyDrive/Pathology_Experiment/imaging_results/\n","print('imaging_results copy 
completed!')"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"background_save":true},"id":"9lzAtLIhnGe5"},"outputs":[{"name":"stdout","output_type":"stream","text":["Sat Nov 25 08:22:43 AM UTC 2023\n"]}],"source":["!date --date='+8 hour' # CST time zone"]}],"metadata":{"accelerator":"GPU","colab":{"machine_shape":"hm","name":"","version":""},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"name":"python"}},"nbformat":4,"nbformat_minor":0} \ No newline at end of file diff --git a/PuzzleTuning/PuzzleTuning.py b/PuzzleTuning/PuzzleTuning.py new file mode 100644 index 0000000000000000000000000000000000000000..c5084bfbf7188ade6bf4e746b6696c38759bdc99 --- /dev/null +++ b/PuzzleTuning/PuzzleTuning.py @@ -0,0 +1,455 @@ +""" +Puzzle Tuning Script ver: Feb 11th 14:00 + +Paper: +https://arxiv.org/abs/2311.06712 +Code: +https://github.com/sagizty/PuzzleTuning +Ref: MAE +https://github.com/facebookresearch/mae + +Step 1: PreTraining on the ImagetNet-1k style dataset (others) +Step 2: Domain Prompt Tuning (PuzzleTuning) on Pathological Images (in ImageFolder) +Step 3: FineTuning on the Downstream Tasks + +This is the training code for step 2 + + +Pre-training Experiments: +DP (data-parallel bash) +python PuzzleTuning.py --batch_size 64 --blr 1.5e-4 --epochs 200 --accum_iter 2 --print_freq 2000 --check_point_gap 50 +--input_size 224 --warmup_epochs 20 --pin_mem --num_workers 32 --strategy loop --PromptTuning Deep --basic_state_dict +/data/saved_models/ViT_b16_224_Imagenet.pth +--data_path /root/datasets/All + +DDP (distributed data-parallel bash) for one machine with 12 GPU +python -m torch.distributed.launch --nproc_per_node=12 --nnodes 1 --node_rank 0 PuzzleTuning.py --DDP_distributed +--batch_size 64 --blr 1.5e-4 --epochs 200 --accum_iter 2 --print_freq 2000 --check_point_gap 50 --input_size 224 +--warmup_epochs 20 --pin_mem --num_workers 32 --strategy loop --PromptTuning Deep --basic_state_dict 
+/data/saved_models/ViT_b16_224_Imagenet.pth +--data_path /root/datasets/All + + +update: +Use "--seg_decoder" parameter to introduce segmentation networks +swin_unet for Swin-Unet +""" + +import argparse +import datetime +import json +import numpy as np +import os +import time +from pathlib import Path + +import torch +import torch.backends.cudnn as cudnn +from tensorboardX import SummaryWriter +import torchvision.transforms as transforms +import torchvision.datasets as datasets + +import timm + +# assert timm.__version__ == "0.3.2" # version check +import timm.optim.optim_factory as optim_factory + +import SSL_structures.misc as misc +from SSL_structures.misc import NativeScalerWithGradNormCount as NativeScaler +from utils.schedulers import patch_scheduler, ratio_scheduler + +from SSL_structures import models_mae, SAE + +from SSL_structures.engine_pretrain import train_one_epoch + + +def main(args): + # choose encoder for timm + basic_encoder = args.model[4:] + + # choose decoder version + args.model = args.model + '_decoder' if args.seg_decoder is not None else args.model + # note decoder + args.model_idx = args.model_idx + args.model + '_' + args.seg_decoder if args.seg_decoder is not None \ + else args.model_idx + args.model + # note PromptTuning + args.model_idx = args.model_idx + '_Prompt_' + args.PromptTuning + '_tokennum_' + str(args.Prompt_Token_num) \ + if args.PromptTuning is not None else args.model_idx + + # fix the seed for reproducibility + if args.DDP_distributed: + misc.init_distributed_mode(args) + seed = args.seed + misc.get_rank() + else: + seed = args.seed + torch.manual_seed(seed) + np.random.seed(seed) + + # set GPUs + cudnn.benchmark = True + device = torch.device(args.device) # cuda + + # simple augmentation + transform_train = transforms.Compose([ + transforms.RandomResizedCrop(args.input_size, scale=(0.8, 1.0), interpolation=3, ratio=(1. / 1., 1. 
/ 1.)), + # 3 is bicubic + # transforms.Resize(args.input_size), + transforms.RandomVerticalFlip(), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + # transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + + dataset_train = datasets.ImageFolder(os.path.join(args.data_path), transform=transform_train) # , 'train' + print('dataset_train:', dataset_train) # Train data + + if args.DDP_distributed: # args.DDP_distributed is True we use distributed data parallel(DDP) + num_tasks = misc.get_world_size() # use misc to set up DDP + global_rank = misc.get_rank() # get the rank of the current running + + sampler_train = torch.utils.data.DistributedSampler( + dataset_train, num_replicas=num_tasks, rank=global_rank, shuffle=True) + print("Sampler_train = %s" % str(sampler_train)) + enable_DistributedSampler = True + batch_size_for_Dataloader = args.batch_size + + else: # Data parallel(DP) instead of distributed data parallel(DDP) + global_rank = 0 + sampler_train = torch.utils.data.RandomSampler(dataset_train) + enable_DistributedSampler = False + batch_size_for_Dataloader = args.batch_size * torch.cuda.device_count() + + # set log on the main process + if global_rank == 0 and args.log_dir is not None: + args.log_dir = os.path.join(args.log_dir, args.model_idx) + os.makedirs(args.log_dir, exist_ok=True) + log_writer = SummaryWriter(log_dir=args.log_dir) # Tensorboard + + print('Task: ' + args.model_idx) + print("Use", torch.cuda.device_count(), "GPUs!") + print('job AImageFolderDir: {}'.format(os.path.dirname(os.path.realpath(__file__)))) + print("{}".format(args).replace(', ', ',\n')) + else: + log_writer = None + + # output_dir + if args.output_dir is not None: + args.output_dir = os.path.join(args.output_dir, args.model_idx) + os.makedirs(args.output_dir, exist_ok=True) + os.makedirs(os.path.join(args.output_dir, 'figs'), exist_ok=True) + print('Training output files will be at', args.output_dir) + else: + print('no out put path 
specified!') + raise + + data_loader_train = torch.utils.data.DataLoader( + dataset_train, sampler=sampler_train, # the shuffle=True is already set in the sampler + batch_size=batch_size_for_Dataloader, + num_workers=args.num_workers, + pin_memory=args.pin_mem, + drop_last=True) + + # define the model + if args.model[0:3] == 'mae': + + if args.basic_state_dict is not None: # Transfer-learning + try: + if args.basic_state_dict == 'timm': + basic_model = timm.create_model('vit_base_patch' + str(16) + '_' + str(args.input_size), + pretrained=True) + basic_state_dict = basic_model.state_dict() + print('MAE Transfer-learning with timm') + else: + basic_state_dict = torch.load(args.basic_state_dict) + if 'model' in basic_state_dict: + basic_state_dict = basic_state_dict['model'] + except: + print('erro in args.basic_state_dict:', args.basic_state_dict) + if args.PromptTuning is not None: + print( + 'In PromptTuning, the basic_state_dict is required, without specification now, timm loaded.\n') + # timm model name basic_encoder + basic_model = timm.create_model(basic_encoder + '_' + str(args.input_size), pretrained=True) + basic_state_dict = basic_model.state_dict() + else: + basic_state_dict = None + print('MAE Restart with a empty backbone') + else: + print('MAE Transfer-learning with:', args.basic_state_dict) + + else: + if args.PromptTuning is not None: + print('In PromptTuning, the basic_state_dict is required, without specification now, timm loaded.\n') + # timm model name basic_encoder + basic_model = timm.create_model(basic_encoder + '_' + str(args.input_size), pretrained=True) + basic_state_dict = basic_model.state_dict() + else: + basic_state_dict = None + print('MAE Restart with a empty backbone') + + # mae-vit-base-patch16 + model = models_mae.__dict__[args.model](img_size=args.input_size, norm_pix_loss=args.norm_pix_loss, + prompt_mode=args.PromptTuning, Prompt_Token_num=args.Prompt_Token_num, + basic_state_dict=basic_state_dict, dec_idx=args.seg_decoder) + # 
setting puzzle_patch_size to not use SAE + puzzle_patch_size_scheduler = None + fix_position_ratio_scheduler = None + + # PuzzleTuning + elif args.model[0:3] == 'sae': + if args.basic_state_dict is not None: + try: + if args.basic_state_dict == 'timm': + print("using timm") + basic_model = timm.create_model(basic_encoder + '_' + str(args.input_size), pretrained=True) + basic_state_dict = basic_model.state_dict() + else: + basic_state_dict = torch.load(args.basic_state_dict) + except: + print('erro in args.basic_state_dict:', args.basic_state_dict) + if args.PromptTuning is not None: + print( + 'In PromptTuning, the basic_state_dict is required, without specification now, timm loaded.\n') + # timm model name basic_encoder + basic_model = timm.create_model(basic_encoder + '_' + str(args.input_size), pretrained=True) + basic_state_dict = basic_model.state_dict() + else: + basic_state_dict = None + print('SAE Restart with a empty backbone') + else: + print('Puzzle tuning with Transfer-learning:', args.basic_state_dict) + else: + if args.PromptTuning is not None: + print('In PromptTuning, the basic_state_dict is required, without specification now, timm loaded.\n') + # timm model name basic_encoder + basic_model = timm.create_model(basic_encoder + '_' + str(args.input_size), pretrained=True) + basic_state_dict = basic_model.state_dict() + else: + basic_state_dict = None + print('Puzzle tuning with a empty backbone') + + model = SAE.__dict__[args.model](img_size=args.input_size, group_shuffle_size=args.group_shuffle_size, + norm_pix_loss=args.norm_pix_loss, + prompt_mode=args.PromptTuning, Prompt_Token_num=args.Prompt_Token_num, + basic_state_dict=basic_state_dict, dec_idx=args.seg_decoder) + + fix_position_ratio_scheduler = ratio_scheduler(total_epoches=args.epochs, + warmup_epochs=args.warmup_epochs, + basic_ratio=0.25, # start ratio + fix_position_ratio=args.fix_position_ratio, # None + strategy=args.strategy) + # strategy=None for fixed else reduce ratio gradually + 
+ # setting puzzle_patch_size to not use MAE + puzzle_patch_size_scheduler = patch_scheduler(total_epoches=args.epochs, + warmup_epochs=args.warmup_epochs, + edge_size=args.input_size, + basic_patch=model.patch_embed.patch_size[0], + fix_patch_size=args.fix_patch_size, # None + strategy=args.strategy) # 'linear' + # NOTICE strategy are used for setting up both the ratio-scheduler and patch-scheduler + + else: + print('This Tuning script only support SAE(PuzzleTuning) or MAE') + return -1 + + # the effective batch size for setting up lr + if args.DDP_distributed: + eff_batch_size = args.batch_size * args.accum_iter * misc.get_world_size() + else: + eff_batch_size = args.batch_size * torch.cuda.device_count() * args.accum_iter + print('eff_batch_size:', eff_batch_size) + + if args.lr is None: # when only base_lr is specified + args.lr = args.blr * eff_batch_size / 256 + + print("base lr: %.2e" % (args.lr * 256 / eff_batch_size)) + print("actual lr: %.2e" % args.lr) + + print("accumulate grad iterations: %d" % args.accum_iter) + print("effective batch size: %d" % eff_batch_size) + + # take the model parameters for optimizer update + model_without_ddp = model + + if args.DDP_distributed: + model.cuda() # args.gpu is obtained by misc.py + # find_unused_parameters=True for the DDP to correctly synchronize layers in back propagation + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=True) + else: + model = torch.nn.DataParallel(model) + model.to(device) + + print("Model = %s" % str(model_without_ddp)) + + # following timm: set wd as 0 for bias and norm layers + param_groups = optim_factory.add_weight_decay(model_without_ddp, args.weight_decay) + optimizer = torch.optim.AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95)) + print(optimizer) + + # loss scaler with gradient clipping + loss_scaler = NativeScaler(GPU_count=torch.cuda.device_count(), DDP_distributed=args.DDP_distributed) + + # if we have --resume,we will load 
the checkpoint and continue training, if not, we start a new training + # the checkpoint should include model, optimizer, loss_scaler information + misc.load_model(args=args, model_without_ddp=model_without_ddp, optimizer=optimizer, loss_scaler=loss_scaler) + + # Training by epochs + print(f"Start training for {args.epochs} epochs") + start_time = time.time() + + for epoch in range(args.start_epoch, args.epochs): + # use args.start_epoch to jump to resume checkpoint + + if enable_DistributedSampler: # DistributedSampler need to .set_epoch(epoch) at each epoch + data_loader_train.sampler.set_epoch(epoch) + + # training iterations + train_stats = train_one_epoch(model, data_loader_train, optimizer, device, epoch, loss_scaler, + fix_position_ratio_scheduler=fix_position_ratio_scheduler, + puzzle_patch_size_scheduler=puzzle_patch_size_scheduler, + check_samples=args.check_samples, + print_freq=args.print_freq, log_writer=log_writer, args=args) + + if args.output_dir and (epoch % args.check_point_gap == 0 or epoch + 1 == args.epochs): + misc.save_model(args=args, model=model, model_without_ddp=model_without_ddp, optimizer=optimizer, + loss_scaler=loss_scaler, epoch=epoch, model_idx=args.model_idx) + + log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, + 'epoch': epoch, } + + # Write log + if args.output_dir and misc.is_main_process(): + if log_writer is not None: + log_writer.flush() + with open(os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8") as f: + f.write(json.dumps(log_stats) + "\n") + + # time stamp + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('Training time {}'.format(total_time_str)) + + +def get_args_parser(): + parser = argparse.ArgumentParser('SAE pre-training', add_help=False) + + # disable_notify + parser.add_argument('--disable_notify', action='store_true', help='do not send email of tracking') + + # Model Name or index + 
parser.add_argument('--model_idx', default='PuzzleTuning_', type=str, help='Model Name or index') + + # Original MAE(224->64) MAE A100(224->256 384->128)SAE(224->128 384->64)SAE-VPT(224->256 384->128) + parser.add_argument('--batch_size', default=64, type=int, + help='Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus') + parser.add_argument('--epochs', default=200, type=int) # epochs原800 + parser.add_argument('--accum_iter', default=2, type=int, + help='Accumulate gradient iterations ' + '(for increasing the effective batch size under memory constraints)') + + # if we have --resume,we will load the checkpoint and continue training, if not, we start a new training + # the checkpoint should include model, optimizer, loss_scaler information + parser.add_argument('--resume', default='', help='resume from checkpoint') + parser.add_argument('--start_epoch', default=0, type=int, metavar='N', help='start epoch of checkpoint') + + # Model parameters sae_vit_base_patch16 mae_vit_base_patch16 + parser.add_argument('--model', default='sae_vit_base_patch16', type=str, metavar='MODEL', + help='Name of model to train') # ori mae_vit_large_patch16 + parser.add_argument('--seg_decoder', default=None, type=str, metavar='segmentation decoder', + help='Name of segmentation decoder') + + parser.add_argument('--input_size', default=224, type=int,help='images input size') + parser.add_argument('--model_patch_size', default=16, type=int, + help='model_patch_size, default 16 for ViT-base') + parser.add_argument('--num_classes', default=3, type=int, # decoder seg class set to channel + help='the number of classes for segmentation') + + # MAE mask_ratio + parser.add_argument('--mask_ratio', default=0.75, type=float, + help='Masking ratio (percentage of removed patches)') + + # Tuning setting + # PromptTuning + parser.add_argument('--PromptTuning', default=None, type=str, + help='use Prompt Tuning strategy (Deep/Shallow) instead of Finetuning (None, by default)') + 
# Prompt_Token_num + parser.add_argument('--Prompt_Token_num', default=20, type=int, help='Prompt_Token_num for VPT backbone') + + # Course learning setting + parser.add_argument('--strategy', default=None, type=str, + help='use linear or other puzzle size scheduler') + parser.add_argument('--fix_position_ratio', default=None, type=float, + help='ablation fix_position_ratio (percentage of position token patches)') + parser.add_argument('--fix_patch_size', default=None, type=int, help='ablation using fix_patch_size') + parser.add_argument('--group_shuffle_size', default=-1, type=int, help='group_shuffle_size of group shuffling,' + 'default -1 for the whole batch as a group') + + # loss settings + parser.add_argument('--norm_pix_loss', action='store_true', + help='Use (per-patch) normalized pixels as targets for computing loss') + parser.set_defaults(norm_pix_loss=False) + + # basic_state_dict + parser.add_argument('--basic_state_dict', default=None, type=str, + help='load basic backbone state_dict for Transfer-learning-based tuning, default None') + + # Optimizer settings + parser.add_argument('--weight_decay', type=float, default=0.05, + help='weight decay (default: 0.05)') + + parser.add_argument('--lr', type=float, default=None, metavar='LR', + help='learning rate (absolute lr), default=None') + parser.add_argument('--blr', type=float, default=1.5e-4, metavar='LR', + help='base learning rate: absolute_lr = base_lr * effective batch size / 256') + parser.add_argument('--min_lr', type=float, default=0., metavar='LR', + help='lower lr bound for cyclic schedulers that hit 0') + + parser.add_argument('--warmup_epochs', type=int, default=20, metavar='N', + help='epochs to warmup LR') + + # PATH settings + # Dataset parameters /datasets01/imagenet_full_size/061417/ /data/imagenet_1k /root/autodl-tmp/imagenet + parser.add_argument('--data_path', default='/root/autodl-tmp/datasets/All', type=str, help='dataset path') + parser.add_argument('--output_dir', 
default='/root/autodl-tmp/runs', + help='path where to save, empty for no saving') + parser.add_argument('--log_dir', default='/root/tf-logs', + help='path where to tensorboard log') + parser.add_argument('--device', default='cuda', + help='device to use for training / testing') + parser.add_argument('--seed', default=42, type=int) + + # dataloader setting + parser.add_argument('--num_workers', default=20, type=int) + # 4A100(16,384,b128, shm40)6A100(36,384,b128, shm100) 8A100(35,384,b128, shm100) + parser.add_argument('--pin_mem', action='store_true', + help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.') + parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem') + parser.set_defaults(pin_mem=True) + + # print_freq and checkpoint + parser.add_argument('--print_freq', default=20, type=int) + parser.add_argument('--check_point_gap', default=50, type=int) + parser.add_argument('--check_samples', default=1, type=int, help='check how many images in a checking batch') + + # DDP_distributed training parameters for DDP + parser.add_argument('--world_size', default=1, type=int, + help='number of DDP_distributed processes') + parser.add_argument('--local_rank', default=-1, type=int) + parser.add_argument('--dist_on_itp', action='store_true') + parser.add_argument('--dist_url', default='env://', + help='url used to set up DDP_distributed training') + parser.add_argument('--DDP_distributed', action='store_true', help='Use DDP in training. 
' + 'without calling, DP with be applied') + + return parser + + +if __name__ == '__main__': + args = get_args_parser() + args = args.parse_args() + + if args.output_dir: + Path(args.output_dir).mkdir(parents=True, exist_ok=True) + + main(args) \ No newline at end of file diff --git a/PuzzleTuning/README.md b/PuzzleTuning/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f7e2cc81f043affc9007d91cce8f85f01b78fbad --- /dev/null +++ b/PuzzleTuning/README.md @@ -0,0 +1,49 @@ +# PuzzleTuning +[[`arXiv`](https://arxiv.org/abs/2311.06712)] [[`Video Presentation`](https://www.bilibili.com/video/BV1ZVHje9EdX)] + +Pathological image analysis is a crucial field in computer vision. Due to the annotation scarcity in the pathological field, recently, most of the works have leveraged self-supervised learning (SSL) trained on unlabeled pathological images, hoping to mine the representation effectively. However, there are two core defects in current SSL-based pathological pre-training: (1) they do not explicitly explore the essential focuses of the pathological field, and (2) they do not effectively bridge with and thus take advantage of the knowledge from natural images. To explicitly address them, we propose our large-scale PuzzleTuning framework, containing the following innovations. Firstly, we define three task focuses that can effectively bridge knowledge of pathological and natural domain: appearance consistency, spatial consistency, and restoration understanding. Secondly, we devise a novel multiple puzzle restoring task, which explicitly pre-trains the model regarding these focuses. Thirdly, we introduce an explicit prompt-tuning process to incrementally integrate the domain-specific knowledge. It builds a bridge to align the large domain gap between natural and pathological images. Additionally, a curriculum-learning training strategy is designed to regulate task difficulty, making the model adaptive to the puzzle restoring complexity.
Experimental results show that our PuzzleTuning framework outperforms the previous state-of-the-art methods in various downstream tasks on multiple datasets. + +fig_concept +Samples illustrate the focuses and relationships in pathological images. They are pancreatic liquid samples (a and b) and colonic epithelium tissue samples (c and d) of normal (a and c) and cancer conditions (b and d). The patches of them are numbered from 1 to 9. Grouping the patches from each image as a bag, after intermixing patches among them, the three pathological focuses of appearance consistency, spatial consistency, and restoration understanding are highlighted. + +fig_PuzzleTuning_method +Overview of PuzzleTuning. Three steps are designed in PuzzleTuning: 1) Puzzle making, where image batch are divided into bags of patches and fix-position and relation identity are randomly assigned. The relation patches are then in-place shuffled with each other, making up the puzzle state. 2) Puzzle understanding, where puzzles regarding grouping, junction, and restoration relationships are learned by prompt tokens attached to the encoder. Through the prompt tokens, the pathological focuses are explicitly seamed with general vision knowledge. 3) Puzzle restoring, where the decoder restores the relation patches with position patches as hint, under SSL supervision against original images. 
+ + +# Usage +## pre-trained weights +we have updated the pre-trained weight of PuzzleTuning and all counterparts at + +https://drive.google.com/file/d/1-mddejIdCRP5AscnlWAyEcGzfgBIRCSf/view?usp=share_link + +## demo with Colab +we have updated a demo for illustration at + +https://github.com/sagizty/PuzzleTuning/blob/main/PuzzleTuning%20Colab%20Demo.ipynb + +## training script +```Shell +python -m torch.distributed.launch --nproc_per_node=8 --nnodes 1 --node_rank 0 PuzzleTuning.py --DDP_distributed --batch_size 64 --group_shuffle_size 8 --blr 1.5e-4 --epochs 2000 --accum_iter 2 --print_freq 5000 --check_point_gap 100 --input_size 224 --warmup_epochs 100 --pin_mem --num_workers 32 --strategy loop --PromptTuning Deep --basic_state_dict /home/saved_models/ViT_b16_224_Imagenet.pth --data_path /home/datasets/All +``` + +## CPIA dataset +https://github.com/zhanglab2021/CPIA_Dataset + +# Results +## Comparison +image +image + +## Domain bridging target +image + +## Domain bridging with Puzzles and Prompts +Screenshot 2023-10-28 at 4 42 31 PM +Screenshot 2023-10-28 at 4 43 02 PM + +image + +## Curriculum learning +Screenshot 2023-10-28 at 4 43 36 PM + +image diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/LICENSE b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..2ef27144b58b2accb943fda2cc282ca5ce407568 --- /dev/null +++ b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Jeya Maria Jose + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons
to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/README.md b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9a6693edd7612c5e739aa0734df2f856ddd35199 --- /dev/null +++ b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/README.md @@ -0,0 +1,152 @@ +# Medical-Transformer + + + +Pytorch code for the paper +["Medical Transformer: Gated Axial-Attention for +Medical Image Segmentation"](https://arxiv.org/pdf/2102.10662.pdf), MICCAI 2021 + +[Paper](https://arxiv.org/pdf/2102.10662.pdf) | [Poster](https://drive.google.com/file/d/1gMjc5guT_dYQFT6TEEwdHAFKwG5XkEc9/view?usp=sharing) + +### About this repo: + +This repo hosts the code for the following networks: + +1) Gated Axial Attention U-Net +2) MedT + +## Introduction + +Majority of existing Transformer-based network architectures proposed for vision applications require large-scale +datasets to train properly. However, compared to the datasets for vision +applications, for medical imaging the number of data samples is relatively +low, making it difficult to efficiently train transformers for medical appli- +cations. 
To this end, we propose a Gated Axial-Attention model which +extends the existing architectures by introducing an additional control +mechanism in the self-attention module. Furthermore, to train the model +effectively on medical images, we propose a Local-Global training strat- +egy (LoGo) which further improves the performance. Specifically, we op- +erate on the whole image and patches to learn global and local features, +respectively. The proposed Medical Transformer (MedT) uses LoGo training strategy on Gated Axial Attention U-Net. + +

+ +

+ +### Using the code: + +- Clone this repository: +```bash +git clone https://github.com/jeya-maria-jose/Medical-Transformer +cd Medical-Transformer +``` + +The code is stable using Python 3.6.10, Pytorch 1.4.0 + +To install all the dependencies using conda: + +```bash +conda env create -f environment.yml +conda activate medt +``` + +To install all the dependencies using pip: + +```bash +pip install -r requirements.txt +``` + +### Links for downloading the public Datasets: + +1) MoNuSeG Dataset - Link (Original) +2) GLAS Dataset - Link (Original) +3) Brain Anatomy US dataset from the paper will be made public soon ! + +## Using the Code for your dataset + +### Dataset Preparation + +Prepare the dataset in the following format for easy use of the code. The train and test folders should contain two subfolders each: img and labelcol. Make sure the images and their corresponding segmentation masks are placed under these folders and have the same name for easy correspondence. Please change the data loaders to your need if you prefer not preparing the dataset in this format. + + + +```bash +Train Folder----- + img---- + 0001.png + 0002.png + ....... + labelcol--- + 0001.png + 0002.png + ....... +Validation Folder----- + img---- + 0001.png + 0002.png + ....... + labelcol--- + 0001.png + 0002.png + ....... +Test Folder----- + img---- + 0001.png + 0002.png + ....... + labelcol--- + 0001.png + 0002.png + ....... + +``` + +- The ground truth images should have pixels corresponding to the labels. Example: In case of binary segmentation, the pixels in the GT should be 0 or 255.
+ +### Training Command: + +```bash +python train.py --train_dataset "enter train directory" --val_dataset "enter validation directory" --direc 'path for results to be saved' --batch_size 4 --epoch 400 --save_freq 10 --modelname "gatedaxialunet" --learning_rate 0.001 --imgsize 128 --gray "no" +``` + +```bash +Change modelname to MedT or logo to train them +``` + +### Testing Command: + +```bash +python test.py --loaddirec "./saved_model_path/model_name.pth" --val_dataset "test dataset directory" --direc 'path for results to be saved' --batch_size 1 --modelname "gatedaxialunet" --imgsize 128 --gray "no" +``` + +The results including predicted segmentations maps will be placed in the results folder along with the model weights. Run the performance metrics code in MATLAB for calculating F1 Score and mIoU. + +### Notes: + +1)Note that these experiments were conducted in Nvidia Quadro 8000 with 48 GB memory. +2)Google Colab Code is an unofficial implementation for quick train/test. Please follow original code for proper training. + +### Acknowledgement: + +The dataloader code is inspired from pytorch-UNet . The axial attention code is developed from axial-deeplab. + +# Citation: + +```bash +@InProceedings{jose2021medical, +author="Valanarasu, Jeya Maria Jose +and Oza, Poojan +and Hacihaliloglu, Ilker +and Patel, Vishal M.", +title="Medical Transformer: Gated Axial-Attention for Medical Image Segmentation", +booktitle="Medical Image Computing and Computer Assisted Intervention -- MICCAI 2021", +year="2021", +publisher="Springer International Publishing", +address="Cham", +pages="36--46", +isbn="978-3-030-87193-2" +} + +``` + +Open an issue or mail me directly in case of any queries or suggestions. 
diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/__init__.py b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/cmd.txt b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/cmd.txt new file mode 100644 index 0000000000000000000000000000000000000000..2437302889786ed5fcf084030aff8f1104e20166 --- /dev/null +++ b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/cmd.txt @@ -0,0 +1,2 @@ +python train.py --train_dataset "/media/jeyamariajose/7888230b-5c10-4229-90f2-c78bdae9c5de/Data/Brain_Ultrasound/Final/resized/train/" --val_dataset "/media/jeyamariajose/7888230b-5c10-4229-90f2-c78bdae9c5de/Data/Brain_Ultrasound/Final/resized/test/" --direc "./results/axial128_en/" --batch_size 4 --modelname "logo" --epoch 401 --save_freq 50 --learning_rate 0.0001 --imgsize 128 + diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/environment.yml b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/environment.yml new file mode 100644 index 0000000000000000000000000000000000000000..384ddd33b36321d13f85bd41d4b612535e36be0b --- /dev/null +++ b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/environment.yml @@ -0,0 +1,133 @@ +name: medt +channels: + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - argon2-cffi=20.1.0=py36h8c4c3a4_1 + - attrs=20.1.0=pyh9f0ad1d_0 + - backcall=0.2.0=pyh9f0ad1d_0 + - backports=1.0=py_2 + - backports.functools_lru_cache=1.6.1=py_0 + - blas=1.0=mkl + - bleach=3.1.5=pyh9f0ad1d_0 + - brotlipy=0.7.0=py36h8c4c3a4_1000 + - ca-certificates=2020.6.20=hecda079_0 + - certifi=2020.6.20=py36h9f0ad1d_0 + - 
cffi=1.11.5=py36_0 + - chardet=3.0.4=py36h9f0ad1d_1006 + - cryptography=3.1=py36h45558ae_0 + - decorator=4.4.2=py_0 + - defusedxml=0.6.0=py_0 + - entrypoints=0.3=py36h9f0ad1d_1001 + - idna=2.10=pyh9f0ad1d_0 + - importlib-metadata=1.7.0=py36h9f0ad1d_0 + - importlib_metadata=1.7.0=0 + - intel-openmp=2020.1=217 + - ipykernel=5.3.4=py36h95af2a2_0 + - ipython=7.16.1=py36h95af2a2_0 + - ipython_genutils=0.2.0=py_1 + - ipywidgets=7.5.1=py_0 + - jedi=0.17.2=py36h9f0ad1d_0 + - jinja2=2.11.2=pyh9f0ad1d_0 + - json5=0.9.4=pyh9f0ad1d_0 + - jsonschema=3.2.0=py36h9f0ad1d_1 + - jupyter_client=6.1.7=py_0 + - jupyter_core=4.6.3=py36h9f0ad1d_1 + - jupyterlab=2.2.6=py_0 + - jupyterlab_server=1.2.0=py_0 + - ld_impl_linux-64=2.33.1=h53a641e_7 + - libedit=3.1.20191231=h7b6447c_0 + - libffi=3.3=he6710b0_1 + - libgcc-ng=9.1.0=hdf63c60_0 + - libgfortran-ng=7.3.0=hdf63c60_0 + - libsodium=1.0.18=h516909a_0 + - libstdcxx-ng=9.1.0=hdf63c60_0 + - markupsafe=1.1.1=py36h8c4c3a4_1 + - mistune=0.8.4=py36h8c4c3a4_1001 + - mkl=2020.1=217 + - mkl-service=2.3.0=py36he904b0f_0 + - mkl_fft=1.1.0=py36h23d657b_0 + - mkl_random=1.1.1=py36h0573a6f_0 + - nbconvert=5.6.1=py36h9f0ad1d_1 + - nbformat=5.0.7=py_0 + - ncurses=6.2=he6710b0_1 + - notebook=6.1.3=py36h9f0ad1d_0 + - numpy=1.18.5=py36ha1c710e_0 + - numpy-base=1.18.5=py36hde5b4d6_0 + - openssl=1.1.1g=h516909a_1 + - packaging=20.4=pyh9f0ad1d_0 + - pandoc=2.10.1=h516909a_0 + - pandocfilters=1.4.2=py_1 + - parso=0.7.1=pyh9f0ad1d_0 + - pexpect=4.8.0=py36h9f0ad1d_1 + - pickleshare=0.7.5=py36h9f0ad1d_1001 + - pip=20.1.1=py36_1 + - prometheus_client=0.8.0=pyh9f0ad1d_0 + - prompt-toolkit=3.0.7=py_0 + - ptyprocess=0.6.0=py_1001 + - pycparser=2.20=pyh9f0ad1d_2 + - pygments=2.6.1=py_0 + - pyopenssl=19.1.0=py_1 + - pyparsing=2.4.7=pyh9f0ad1d_0 + - pyrsistent=0.16.0=py36h8c4c3a4_0 + - pysocks=1.7.1=py36h9f0ad1d_1 + - python=3.6.10=h7579374_2 + - python-dateutil=2.8.1=py_0 + - python_abi=3.6=1_cp36m + - pyzmq=19.0.2=py36h9947dbf_0 + - readline=8.0=h7b6447c_0 + - 
requests=2.24.0=pyh9f0ad1d_0 + - send2trash=1.5.0=py_0 + - setuptools=47.3.1=py36_0 + - six=1.15.0=py_0 + - sqlite=3.32.3=h62c20be_0 + - terminado=0.8.3=py36h9f0ad1d_1 + - testpath=0.4.4=py_0 + - tk=8.6.10=hbc83047_0 + - tornado=6.0.4=py36h8c4c3a4_1 + - traitlets=4.3.3=py36h9f0ad1d_1 + - urllib3=1.25.10=py_0 + - wcwidth=0.2.5=pyh9f0ad1d_1 + - webencodings=0.5.1=py_1 + - wheel=0.34.2=py36_0 + - widgetsnbextension=3.5.1=py36h9f0ad1d_1 + - xz=5.2.5=h7b6447c_0 + - yaml=0.2.5=h7b6447c_0 + - zeromq=4.3.2=he1b5a44_3 + - zipp=3.1.0=py_0 + - zlib=1.2.11=h7b6447c_3 + - pip: + - ci-info==0.2.0 + - click==7.1.2 + - cython==0.29.20 + - et-xmlfile==1.0.1 + - etelemetry==0.2.1 + - filelock==3.0.12 + - isodate==0.6.0 + - jdcal==1.4.1 + - joblib==0.17.0 + - lxml==4.5.1 + - matplotlib==3.3.2 + - medpy==0.4.0 + - natsort==7.0.1 + - nibabel==3.1.0 + - nipype==1.5.0 + - openpyxl==3.0.4 + - prov==1.5.3 + - pydicom==2.0.0 + - pydot==1.4.1 + - pydotplus==2.0.2 + - pynrrd==0.4.2 + - rdflib==5.0.0 + - scikit-learn==0.23.2 + - scipy==1.5.3 + - setproctitle==1.1.10 + - simplejson==3.17.0 + - threadpoolctl==2.1.0 + - torch==1.4.0 + - torch-dwconv==0.1.0 + - torchvision==0.4.0 + - traits==6.1.0 +prefix: /home/jeyamariajose/anaconda3/envs/medt + diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/extractors.py b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/extractors.py new file mode 100644 index 0000000000000000000000000000000000000000..3f79b12882940bcfc6c1b951cf95f2c5e7e0620d --- /dev/null +++ b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/extractors.py @@ -0,0 +1,373 @@ +from collections import OrderedDict +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils import model_zoo +from torchvision.models.densenet import densenet121, densenet161 +from torchvision.models.squeezenet import squeezenet1_1 + + +def load_weights_sequential(target, 
source_state): + new_dict = OrderedDict() + for (k1, v1), (k2, v2) in zip(target.state_dict().items(), source_state.items()): + new_dict[k1] = v2 + target.load_state_dict(new_dict) + +''' + Implementation of dilated ResNet-101 with deep supervision. Downsampling is changed to 8x +''' +model_urls = { + 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', + 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', + 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', + 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', + 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', +} + + +def conv3x3(in_planes, out_planes, stride=1, dilation=1): + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=dilation, dilation=dilation, bias=False) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None, dilation=1): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(inplanes, planes, stride=stride, dilation=dilation) + self.bn1 = nn.BatchNorm2d(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes, stride=1, dilation=dilation) + self.bn2 = nn.BatchNorm2d(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None, dilation=1): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, dilation=dilation, + 
padding=dilation, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * 4) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class ResNet(nn.Module): + def __init__(self, block, layers=(3, 4, 23, 3)): + self.inplanes = 64 + super(ResNet, self).__init__() + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, + bias=False) + self.bn1 = nn.BatchNorm2d(64) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=1, dilation=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=4) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. 
/ n)) + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + + def _make_layer(self, block, planes, blocks, stride=1, dilation=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes * block.expansion), + ) + + layers = [block(self.inplanes, planes, stride, downsample)] + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes, dilation=dilation)) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x = self.layer1(x) + x = self.layer2(x) + x_3 = self.layer3(x) + x = self.layer4(x_3) + + return x, x_3 + + +''' + Implementation of DenseNet with deep supervision. Downsampling is changed to 8x +''' + + +class _DenseLayer(nn.Sequential): + def __init__(self, num_input_features, growth_rate, bn_size, drop_rate, index): + super(_DenseLayer, self).__init__() + self.add_module('norm1', nn.BatchNorm2d(num_input_features)), + self.add_module('relu1', nn.ReLU(inplace=True)), + if index == 3: + self.add_module('conv1', nn.Conv2d(num_input_features, bn_size * + growth_rate, kernel_size=1, stride=1, bias=False)), + self.add_module('norm2', nn.BatchNorm2d(bn_size * growth_rate)), + self.add_module('relu2', nn.ReLU(inplace=True)), + self.add_module('conv2', nn.Conv2d(bn_size * growth_rate, growth_rate, + kernel_size=3, stride=1, dilation=2, padding=2, bias=False)), + else: + self.add_module('conv1', nn.Conv2d(num_input_features, bn_size * + growth_rate, kernel_size=1, stride=1, bias=False)), + self.add_module('norm2', nn.BatchNorm2d(bn_size * growth_rate)), + self.add_module('relu2', nn.ReLU(inplace=True)), + self.add_module('conv2', nn.Conv2d(bn_size * growth_rate, growth_rate, + kernel_size=3, stride=1, padding=1, 
bias=False)), + self.drop_rate = drop_rate + + def forward(self, x): + new_features = super(_DenseLayer, self).forward(x) + if self.drop_rate > 0: + new_features = F.dropout(new_features, p=self.drop_rate, training=self.training) + return torch.cat([x, new_features], 1) + + +class _DenseBlock(nn.Sequential): + def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate, index): + super(_DenseBlock, self).__init__() + for i in range(num_layers): + layer = _DenseLayer(num_input_features + i * growth_rate, growth_rate, bn_size, drop_rate, index) + self.add_module('denselayer%d' % (i + 1), layer) + + +class _Transition(nn.Sequential): + def __init__(self, num_input_features, num_output_features, downsample=True): + super(_Transition, self).__init__() + self.add_module('norm', nn.BatchNorm2d(num_input_features)) + self.add_module('relu', nn.ReLU(inplace=True)) + self.add_module('conv', nn.Conv2d(num_input_features, num_output_features, + kernel_size=1, stride=1, bias=False)) + if downsample: + self.add_module('pool', nn.AvgPool2d(kernel_size=2, stride=2)) + else: + self.add_module('pool', nn.AvgPool2d(kernel_size=1, stride=1)) # compatibility hack + + +class DenseNet(nn.Module): + def __init__(self, growth_rate=8, block_config=(6, 12, 24, 16), + num_init_features=16, bn_size=4, drop_rate=0, pretrained=False): + + super(DenseNet, self).__init__() + + # First convolution + self.start_features = nn.Sequential(OrderedDict([ + ('conv0', nn.Conv2d(3, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)), + ('norm0', nn.BatchNorm2d(num_init_features)), + ('relu0', nn.ReLU(inplace=True)), + ('pool0', nn.MaxPool2d(kernel_size=3, stride=2, padding=1)), + ])) + + # Each denseblock + num_features = num_init_features + + init_weights = list(densenet121(pretrained=True).features.children()) + start = 0 + for i, c in enumerate(self.start_features.children()): + #if pretrained: + #c.load_state_dict(init_weights[i].state_dict()) + start += 1 + 
self.blocks = nn.ModuleList() + for i, num_layers in enumerate(block_config): + block = _DenseBlock(num_layers=num_layers, num_input_features=num_features, + bn_size=bn_size, growth_rate=growth_rate, drop_rate=drop_rate, index = i) + if pretrained: + block.load_state_dict(init_weights[start].state_dict()) + start += 1 + self.blocks.append(block) + setattr(self, 'denseblock%d' % (i + 1), block) + + num_features = num_features + num_layers * growth_rate + if i != len(block_config) - 1: + downsample = i < 1 + trans = _Transition(num_input_features=num_features, num_output_features=num_features // 2, + downsample=downsample) + if pretrained: + trans.load_state_dict(init_weights[start].state_dict()) + start += 1 + self.blocks.append(trans) + setattr(self, 'transition%d' % (i + 1), trans) + num_features = num_features // 2 + + def forward(self, x): + out = self.start_features(x) + deep_features = None + for i, block in enumerate(self.blocks): + out = block(out) + if i == 5: + deep_features = out + + return out, deep_features + + +class Fire(nn.Module): + + def __init__(self, inplanes, squeeze_planes, + expand1x1_planes, expand3x3_planes, dilation=1): + super(Fire, self).__init__() + self.inplanes = inplanes + self.squeeze = nn.Conv2d(inplanes, squeeze_planes, kernel_size=1) + self.squeeze_activation = nn.ReLU(inplace=True) + self.expand1x1 = nn.Conv2d(squeeze_planes, expand1x1_planes, + kernel_size=1) + self.expand1x1_activation = nn.ReLU(inplace=True) + self.expand3x3 = nn.Conv2d(squeeze_planes, expand3x3_planes, + kernel_size=3, padding=dilation, dilation=dilation) + self.expand3x3_activation = nn.ReLU(inplace=True) + + def forward(self, x): + x = self.squeeze_activation(self.squeeze(x)) + return torch.cat([ + self.expand1x1_activation(self.expand1x1(x)), + self.expand3x3_activation(self.expand3x3(x)) + ], 1) + + +class SqueezeNet(nn.Module): + + def __init__(self, pretrained=False): + super(SqueezeNet, self).__init__() + + self.feat_1 = nn.Sequential( + nn.Conv2d(3, 
64, kernel_size=3, stride=2, padding=1), + nn.ReLU(inplace=True) + ) + self.feat_2 = nn.Sequential( + nn.MaxPool2d(kernel_size=3, stride=2, padding=1), + Fire(64, 16, 64, 64), + Fire(128, 16, 64, 64) + ) + self.feat_3 = nn.Sequential( + nn.MaxPool2d(kernel_size=3, stride=2, padding=1), + Fire(128, 32, 128, 128, 2), + Fire(256, 32, 128, 128, 2) + ) + self.feat_4 = nn.Sequential( + Fire(256, 48, 192, 192, 4), + Fire(384, 48, 192, 192, 4), + Fire(384, 64, 256, 256, 4), + Fire(512, 64, 256, 256, 4) + ) + if pretrained: + weights = squeezenet1_1(pretrained=True).features.state_dict() + load_weights_sequential(self, weights) + + def forward(self, x): + f1 = self.feat_1(x) + f2 = self.feat_2(f1) + f3 = self.feat_3(f2) + f4 = self.feat_4(f3) + return f4, f3 + + +''' + Handy methods for construction +''' + + +def squeezenet(pretrained=True): + return SqueezeNet(pretrained) + + +def densenet(pretrained=True): + return DenseNet(pretrained=pretrained) + + +def resnet18(pretrained=True): + model = ResNet(BasicBlock, [2, 2, 2, 2]) + if pretrained: + load_weights_sequential(model, model_zoo.load_url(model_urls['resnet18'])) + return model + + +def resnet34(pretrained=True): + model = ResNet(BasicBlock, [3, 4, 6, 3]) + if pretrained: + load_weights_sequential(model, model_zoo.load_url(model_urls['resnet34'])) + return model + + +def resnet50(pretrained=True): + model = ResNet(Bottleneck, [3, 4, 6, 3]) + if pretrained: + load_weights_sequential(model, model_zoo.load_url(model_urls['resnet50'])) + return model + + +def resnet101(pretrained=True): + model = ResNet(Bottleneck, [3, 4, 23, 3]) + if pretrained: + load_weights_sequential(model, model_zoo.load_url(model_urls['resnet101'])) + return model + + +def resnet152(pretrained=True): + model = ResNet(Bottleneck, [3, 8, 36, 3]) + if pretrained: + load_weights_sequential(model, model_zoo.load_url(model_urls['resnet152'])) + return model diff --git 
a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/img/arch.png b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/img/arch.png new file mode 100644 index 0000000000000000000000000000000000000000..d84791afdda09a0d95d9a6d312557dc09dd515d3 Binary files /dev/null and b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/img/arch.png differ diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/img/medt.png b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/img/medt.png new file mode 100644 index 0000000000000000000000000000000000000000..d84791afdda09a0d95d9a6d312557dc09dd515d3 Binary files /dev/null and b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/img/medt.png differ diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/img/medt1.png b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/img/medt1.png new file mode 100644 index 0000000000000000000000000000000000000000..c11b58abf88ecf2ea6e420cc62afe2683fc42e62 Binary files /dev/null and b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/img/medt1.png differ diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/img/poster.pdf b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/img/poster.pdf new file mode 100644 index 0000000000000000000000000000000000000000..e1576b1762ac42d689b9b5cd21da333c5b66ac67 Binary files /dev/null and b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/img/poster.pdf differ diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/__init__.py b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..05b870d489e93480b95ec238b0191c5b885d6b5d --- /dev/null +++ b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/__init__.py @@ -0,0 +1,7 @@ +from .build_dataloader import build_dataloader +from .build_model import build_model +from .build_optimizer import build_optimizer +from .metrics import Metric + + +__all__ = ['build_dataloader', 'build_model', 'build_optimizer', 'Metric'] \ No newline at end of file diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/__pycache__/__init__.cpython-36.pyc b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..626fa0a33129e07d3a3a79804b929d6e9a66700c Binary files /dev/null and b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/__pycache__/__init__.cpython-36.pyc differ diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/__pycache__/__init__.cpython-37.pyc b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..26c2e4ceb062fc6e3085b0af00f8130c8964587a Binary files /dev/null and b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/__pycache__/__init__.cpython-37.pyc differ diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/__pycache__/build_dataloader.cpython-36.pyc b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/__pycache__/build_dataloader.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..706c56560bc0b60b333e7040fd76aa7979ff1886 Binary files /dev/null and 
b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/__pycache__/build_dataloader.cpython-36.pyc differ diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/__pycache__/build_dataloader.cpython-37.pyc b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/__pycache__/build_dataloader.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d5b4e3a89a8e8242855c4ff186783033bbf5ae13 Binary files /dev/null and b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/__pycache__/build_dataloader.cpython-37.pyc differ diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/__pycache__/build_model.cpython-36.pyc b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/__pycache__/build_model.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..83e5488245584d9e5d964c9ed93b458eebde5944 Binary files /dev/null and b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/__pycache__/build_model.cpython-36.pyc differ diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/__pycache__/build_model.cpython-37.pyc b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/__pycache__/build_model.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c6f6e6686988405262a8ce25a5d476ad6d855e08 Binary files /dev/null and b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/__pycache__/build_model.cpython-37.pyc differ diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/__pycache__/build_optimizer.cpython-36.pyc b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/__pycache__/build_optimizer.cpython-36.pyc new file 
mode 100644 index 0000000000000000000000000000000000000000..4a8a1808145d702defe9b066d17150ce20311e8d Binary files /dev/null and b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/__pycache__/build_optimizer.cpython-36.pyc differ diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/__pycache__/build_optimizer.cpython-37.pyc b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/__pycache__/build_optimizer.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bf340ff36aa067b16fa512eb56235adcb6a01453 Binary files /dev/null and b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/__pycache__/build_optimizer.cpython-37.pyc differ diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/__pycache__/metrics.cpython-36.pyc b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/__pycache__/metrics.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0ed2f6d0d01dd679f533519b2080ee0e3eb6bda4 Binary files /dev/null and b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/__pycache__/metrics.cpython-36.pyc differ diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/__pycache__/metrics.cpython-37.pyc b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/__pycache__/metrics.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..768bc2fc98d8dd68175e0ac5733d08f5d82ca592 Binary files /dev/null and b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/__pycache__/metrics.cpython-37.pyc differ diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/build_dataloader.py 
def build_optimizer(args, model):
    """Create the optimizer selected by ``args.optim`` for ``model``.

    Only ``'sgd'`` is implemented. It is configured from ``args.lr``,
    ``args.momentum``, ``args.weight_decay`` and ``args.nesterov``.

    Args:
        args: namespace-like object carrying the attributes listed above.
        model: module whose ``parameters()`` are handed to the optimizer.

    Returns:
        The constructed ``torch.optim.SGD`` instance.

    Raises:
        AssertionError: if ``args.optim`` names an unsupported optimizer.
    """
    if args.optim != 'sgd':
        # Same exception type as before, but now it says what went wrong.
        raise AssertionError(
            "unsupported optimizer {!r}; only 'sgd' is implemented".format(args.optim))
    return optim.SGD(model.parameters(), lr=args.lr,
                     momentum=args.momentum, weight_decay=args.weight_decay,
                     nesterov=args.nesterov)
b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7d4c38e378591d894a22335fe18dbd6b28c771ad --- /dev/null +++ b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/datasets/__init__.py @@ -0,0 +1,4 @@ +from .imagenet1k import imagenet1k + + +__all__ = ['imagenet1k'] diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/datasets/__pycache__/__init__.cpython-36.pyc b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/datasets/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..90d03bfba52450b34e62841124c9637428d9ad79 Binary files /dev/null and b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/datasets/__pycache__/__init__.cpython-36.pyc differ diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/datasets/__pycache__/__init__.cpython-37.pyc b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/datasets/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fb7ba1f516ffc97a54c04ddd89c3b87bd3ec615d Binary files /dev/null and b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/datasets/__pycache__/__init__.cpython-37.pyc differ diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/datasets/__pycache__/imagenet1k.cpython-36.pyc b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/datasets/__pycache__/imagenet1k.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..402f6db9c829f2ddcc802ce9e61f337e46e327ac Binary files /dev/null and 
b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/datasets/__pycache__/imagenet1k.cpython-36.pyc differ diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/datasets/__pycache__/imagenet1k.cpython-37.pyc b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/datasets/__pycache__/imagenet1k.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..79329fe2ca03ce87e888046a13d1c0f53cfa2b04 Binary files /dev/null and b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/datasets/__pycache__/imagenet1k.cpython-37.pyc differ diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/datasets/imagenet1k.py b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/datasets/imagenet1k.py new file mode 100644 index 0000000000000000000000000000000000000000..d4ed14df7e0ea16bc4554d96397220e2e7706883 --- /dev/null +++ b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/datasets/imagenet1k.py @@ -0,0 +1,56 @@ +import torch +import torchvision +from torchvision import datasets, transforms + + +def imagenet1k(args, distributed=False): + train_dirs = args.train_dirs + val_dirs = args.val_dirs + batch_size = args.batch_size + val_batch_size = args.val_batch_size + num_workers = args.num_workers + color_jitter = args.color_jitter + + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + process = [ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + ] + if color_jitter: + process += [transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1)] + process += [ + transforms.ToTensor(), + normalize + ] + + transform_train = transforms.Compose(process) + + train_set = datasets.ImageFolder(train_dirs, + transform=transform_train) + + if distributed: + 
class Metric(object):
    """Running average of a scalar value, accumulated on the CPU.

    Attributes are plain tensors: ``sum`` holds the accumulated total and
    ``n`` the number of ``update`` calls.
    """

    def __init__(self, name):
        self.name = name
        self.sum = torch.tensor(0.)
        self.n = torch.tensor(0.)

    def update(self, val):
        """Add one observation.

        Args:
            val: a scalar tensor (detached and moved to the CPU before being
                accumulated) or a plain Python / NumPy number.
        """
        # ``as_tensor`` leaves tensors untouched and wraps plain numbers, so
        # callers may pass e.g. ``loss`` or ``loss.item()`` interchangeably.
        self.sum += torch.as_tensor(val).detach().cpu()
        self.n += 1

    @property
    def avg(self):
        """Mean of all observations so far (NaN before the first update)."""
        return self.sum / self.n
b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/models/__pycache__/axialnet.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b6445b4c9ce8026232e5c17efe90fa1ba6ca6961 Binary files /dev/null and b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/models/__pycache__/axialnet.cpython-36.pyc differ diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/models/__pycache__/axialnet.cpython-37.pyc b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/models/__pycache__/axialnet.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..60cfba896fb19500c251cba7b07c40bc124cfdc9 Binary files /dev/null and b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/models/__pycache__/axialnet.cpython-37.pyc differ diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/models/__pycache__/resnet.cpython-36.pyc b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/models/__pycache__/resnet.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..de0ac4e82f04f84742fb090d04a10b7bbe8a24f9 Binary files /dev/null and b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/models/__pycache__/resnet.cpython-36.pyc differ diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/models/__pycache__/resnet.cpython-37.pyc b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/models/__pycache__/resnet.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..620b283c43b9d4cbcd8eb5bc97d086fc29acff94 Binary files /dev/null and b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/models/__pycache__/resnet.cpython-37.pyc differ diff 
class AxialAttention(nn.Module):
    """Grouped self-attention along a single axis with relative position terms.

    The input is permuted so that the attended axis comes last, projected to
    queries/keys/values by ``qkv_transform``, and combined with slices of a
    learned ``relative`` embedding table. ``width`` selects which of the two
    permutations is applied; ``stride > 1`` adds average pooling on the output.
    The ``bias`` argument is stored but the projection is always built with
    ``bias=False``.

    NOTE(review): the ``view`` onto ``(kernel_size, kernel_size)`` embeddings
    implies the attended axis must have length ``kernel_size`` - confirm.
    """

    def __init__(self, in_planes, out_planes, groups=8, kernel_size=56,
                 stride=1, bias=False, width=False):
        assert (in_planes % groups == 0) and (out_planes % groups == 0)
        super(AxialAttention, self).__init__()
        self.in_planes = in_planes
        self.out_planes = out_planes
        self.groups = groups
        self.group_planes = out_planes // groups
        self.kernel_size = kernel_size
        self.stride = stride
        self.bias = bias
        self.width = width

        # Multi-head self attention
        doubled_planes = out_planes * 2
        self.qkv_transform = qkv_transform(in_planes, doubled_planes, kernel_size=1, stride=1,
                                           padding=0, bias=False)
        self.bn_qkv = nn.BatchNorm1d(doubled_planes)
        self.bn_similarity = nn.BatchNorm2d(groups * 3)
        self.bn_output = nn.BatchNorm1d(doubled_planes)

        # Position embedding: one column per relative offset in
        # [-(kernel_size - 1), kernel_size - 1], gathered via ``flatten_index``.
        self.relative = nn.Parameter(torch.randn(self.group_planes * 2, kernel_size * 2 - 1),
                                     requires_grad=True)
        positions = torch.arange(kernel_size)
        offsets = positions.unsqueeze(1) - positions.unsqueeze(0) + kernel_size - 1
        self.register_buffer('flatten_index', offsets.view(-1))

        if stride > 1:
            self.pooling = nn.AvgPool2d(stride, stride=stride)

        self.reset_parameters()

    def forward(self, x):
        """Apply axial attention to a 4-D tensor and return a 4-D tensor."""
        # Bring the attended axis last; the other spatial axis is folded into
        # the batch dimension below.
        in_order = (0, 2, 1, 3) if self.width else (0, 3, 1, 2)
        x = x.permute(*in_order)
        N, W, C, H = x.shape
        x = x.contiguous().view(N * W, C, H)

        half = self.group_planes // 2
        split_sizes = [half, half, self.group_planes]

        # Transformations
        projected = self.bn_qkv(self.qkv_transform(x))
        projected = projected.reshape(N * W, self.groups, self.group_planes * 2, H)
        q, k, v = torch.split(projected, split_sizes, dim=2)

        # Calculate position embedding
        table = torch.index_select(self.relative, 1, self.flatten_index)
        table = table.view(self.group_planes * 2, self.kernel_size, self.kernel_size)
        q_embedding, k_embedding, v_embedding = torch.split(table, split_sizes, dim=0)

        qr = torch.einsum('bgci,cij->bgij', q, q_embedding)
        kr = torch.einsum('bgci,cij->bgij', k, k_embedding).transpose(2, 3)
        qk = torch.einsum('bgci, bgcj->bgij', q, k)

        # The three similarity terms are normalised jointly, then summed.
        logits = torch.cat([qk, qr, kr], dim=1)
        logits = self.bn_similarity(logits).view(N * W, 3, self.groups, H, H).sum(dim=1)
        similarity = F.softmax(logits, dim=3)

        sv = torch.einsum('bgij,bgcj->bgci', similarity, v)
        sve = torch.einsum('bgij,cij->bgci', similarity, v_embedding)
        merged = torch.cat([sv, sve], dim=-1).view(N * W, self.out_planes * 2, H)
        output = self.bn_output(merged).view(N, W, self.out_planes, 2, H).sum(dim=-2)

        # Undo the input permutation.
        out_order = (0, 2, 1, 3) if self.width else (0, 2, 3, 1)
        output = output.permute(*out_order)

        if self.stride > 1:
            output = self.pooling(output)

        return output

    def reset_parameters(self):
        """Re-initialise the projection weight and the relative embedding table."""
        qkv_std = math.sqrt(1. / self.in_planes)
        relative_std = math.sqrt(1. / self.group_planes)
        self.qkv_transform.weight.data.normal_(0, qkv_std)
        nn.init.normal_(self.relative, 0., relative_std)
self.group_planes], dim=2) + + # Calculate position embedding + all_embeddings = torch.index_select(self.relative, 1, self.flatten_index).view(self.group_planes * 2, self.kernel_size, self.kernel_size) + q_embedding, k_embedding, v_embedding = torch.split(all_embeddings, [self.group_planes // 2, self.group_planes // 2, self.group_planes], dim=0) + qr = torch.einsum('bgci,cij->bgij', q, q_embedding) + kr = torch.einsum('bgci,cij->bgij', k, k_embedding).transpose(2, 3) + qk = torch.einsum('bgci, bgcj->bgij', q, k) + + + # multiply by factors + qr = torch.mul(qr, self.f_qr) + kr = torch.mul(kr, self.f_kr) + + stacked_similarity = torch.cat([qk, qr, kr], dim=1) + stacked_similarity = self.bn_similarity(stacked_similarity).view(N * W, 3, self.groups, H, H).sum(dim=1) + #stacked_similarity = self.bn_qr(qr) + self.bn_kr(kr) + self.bn_qk(qk) + # (N, groups, H, H, W) + similarity = F.softmax(stacked_similarity, dim=3) + sv = torch.einsum('bgij,bgcj->bgci', similarity, v) + sve = torch.einsum('bgij,cij->bgci', similarity, v_embedding) + + # multiply by factors + sv = torch.mul(sv, self.f_sv) + sve = torch.mul(sve, self.f_sve) + + stacked_output = torch.cat([sv, sve], dim=-1).view(N * W, self.out_planes * 2, H) + output = self.bn_output(stacked_output).view(N, W, self.out_planes, 2, H).sum(dim=-2) + + if self.width: + output = output.permute(0, 2, 1, 3) + else: + output = output.permute(0, 2, 3, 1) + + if self.stride > 1: + output = self.pooling(output) + + return output + def reset_parameters(self): + self.qkv_transform.weight.data.normal_(0, math.sqrt(1. / self.in_planes)) + #nn.init.uniform_(self.relative, -0.1, 0.1) + nn.init.normal_(self.relative, 0., math.sqrt(1. 
/ self.group_planes)) + +class AxialAttention_wopos(nn.Module): + def __init__(self, in_planes, out_planes, groups=8, kernel_size=56, + stride=1, bias=False, width=False): + assert (in_planes % groups == 0) and (out_planes % groups == 0) + super(AxialAttention_wopos, self).__init__() + self.in_planes = in_planes + self.out_planes = out_planes + self.groups = groups + self.group_planes = out_planes // groups + self.kernel_size = kernel_size + self.stride = stride + self.bias = bias + self.width = width + + # Multi-head self attention + self.qkv_transform = qkv_transform(in_planes, out_planes * 2, kernel_size=1, stride=1, + padding=0, bias=False) + self.bn_qkv = nn.BatchNorm1d(out_planes * 2) + self.bn_similarity = nn.BatchNorm2d(groups ) + + self.bn_output = nn.BatchNorm1d(out_planes * 1) + + if stride > 1: + self.pooling = nn.AvgPool2d(stride, stride=stride) + + self.reset_parameters() + + def forward(self, x): + if self.width: + x = x.permute(0, 2, 1, 3) + else: + x = x.permute(0, 3, 1, 2) # N, W, C, H + N, W, C, H = x.shape + x = x.contiguous().view(N * W, C, H) + + # Transformations + qkv = self.bn_qkv(self.qkv_transform(x)) + q, k, v = torch.split(qkv.reshape(N * W, self.groups, self.group_planes * 2, H), [self.group_planes // 2, self.group_planes // 2, self.group_planes], dim=2) + + qk = torch.einsum('bgci, bgcj->bgij', q, k) + + stacked_similarity = self.bn_similarity(qk).reshape(N * W, 1, self.groups, H, H).sum(dim=1).contiguous() + + similarity = F.softmax(stacked_similarity, dim=3) + sv = torch.einsum('bgij,bgcj->bgci', similarity, v) + + sv = sv.reshape(N*W,self.out_planes * 1, H).contiguous() + output = self.bn_output(sv).reshape(N, W, self.out_planes, 1, H).sum(dim=-2).contiguous() + + + if self.width: + output = output.permute(0, 2, 1, 3) + else: + output = output.permute(0, 2, 3, 1) + + if self.stride > 1: + output = self.pooling(output) + + return output + + def reset_parameters(self): + self.qkv_transform.weight.data.normal_(0, math.sqrt(1. 
class AxialBlock(nn.Module):
    """Residual bottleneck whose spatial mixing is two ``AxialAttention`` layers.

    A 1x1 convolution maps ``inplanes`` to ``width`` channels, attention is
    applied with the default axis and then with ``width=True`` (the second
    layer carries ``stride``), and a final 1x1 convolution expands to
    ``planes * expansion`` channels before the skip connection is added.
    ``dilation`` is accepted but not used by this block.
    """

    expansion = 2

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None, kernel_size=56):
        super(AxialBlock, self).__init__()
        norm_layer = nn.BatchNorm2d if norm_layer is None else norm_layer
        width = int(planes * (base_width / 64.))
        expanded = planes * self.expansion

        # Both the strided attention layer and ``downsample`` reduce the
        # resolution when stride != 1.
        self.conv_down = conv1x1(inplanes, width)
        self.bn1 = norm_layer(width)
        self.hight_block = AxialAttention(width, width, groups=groups, kernel_size=kernel_size)
        self.width_block = AxialAttention(width, width, groups=groups, kernel_size=kernel_size,
                                          stride=stride, width=True)
        self.conv_up = conv1x1(width, expanded)
        self.bn2 = norm_layer(expanded)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ``relu(branch(x) + shortcut(x))``."""
        shortcut = x

        # Channel reduction: inplanes -> width
        out = self.relu(self.bn1(self.conv_down(x)))

        out = self.width_block(self.hight_block(out))
        out = self.relu(out)

        # Channel expansion: width -> planes * expansion
        out = self.bn2(self.conv_up(out))

        if self.downsample is not None:
            shortcut = self.downsample(x)

        out += shortcut
        return self.relu(out)
class AxialBlock_wopos(nn.Module):
    """Residual bottleneck built on ``AxialAttention_wopos`` layers.

    Data flow: 1x1 conv (``inplanes -> width``), norm, ReLU, attention along
    the height axis, attention along the width axis (this layer receives
    ``stride``), ReLU, 1x1 conv (``width -> planes * expansion``), norm,
    residual add, ReLU.

    Args:
        inplanes: Number of input channels.
        planes: Base channel count; the block emits ``planes * expansion``.
        stride: Stride handed to the width-axis attention layer.
        downsample: Optional module applied to the input to build the
            shortcut; with ``None`` the input itself is the shortcut.
        groups: Group count handed to both attention layers.
        base_width: Scales the inner width as ``planes * base_width / 64``.
        dilation: Accepted for signature compatibility; unused in this block.
        norm_layer: Normalisation layer factory; ``nn.BatchNorm2d`` if ``None``.
        kernel_size: Handed to both attention layers.
    """

    expansion = 2

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None, kernel_size=56):
        super().__init__()
        make_norm = nn.BatchNorm2d if norm_layer is None else norm_layer
        mid_channels = int(planes * (base_width / 64.))
        out_channels = planes * self.expansion

        self.conv_down = conv1x1(inplanes, mid_channels)
        # ``conv1`` is registered but never called in ``forward``; it stays so
        # the module's parameter set (and checkpoint keys) are unchanged.
        self.conv1 = nn.Conv2d(mid_channels, mid_channels, kernel_size=1)
        self.bn1 = make_norm(mid_channels)
        # Only the width-axis layer is given the stride.
        self.hight_block = AxialAttention_wopos(mid_channels, mid_channels, groups=groups,
                                                kernel_size=kernel_size)
        self.width_block = AxialAttention_wopos(mid_channels, mid_channels, groups=groups,
                                                kernel_size=kernel_size, stride=stride,
                                                width=True)
        self.conv_up = conv1x1(mid_channels, out_channels)
        self.bn2 = make_norm(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        feat = self.relu(self.bn1(self.conv_down(x)))
        feat = self.width_block(self.hight_block(feat))
        feat = self.relu(feat)
        feat = self.bn2(self.conv_up(feat))

        shortcut = x if self.downsample is None else self.downsample(x)

        feat += shortcut
        return self.relu(feat)
kernel_size=(img_size//8), + dilate=replace_stride_with_dilation[2]) + + # Decoder + self.decoder1 = nn.Conv2d(int(1024 *2*s) , int(1024*2*s), kernel_size=3, stride=2, padding=1) + self.decoder2 = nn.Conv2d(int(1024 *2*s) , int(1024*s), kernel_size=3, stride=1, padding=1) + self.decoder3 = nn.Conv2d(int(1024*s), int(512*s), kernel_size=3, stride=1, padding=1) + self.decoder4 = nn.Conv2d(int(512*s) , int(256*s), kernel_size=3, stride=1, padding=1) + self.decoder5 = nn.Conv2d(int(256*s) , int(128*s) , kernel_size=3, stride=1, padding=1) + self.adjust = nn.Conv2d(int(128*s) , num_classes, kernel_size=1, stride=1, padding=0) + self.soft = nn.Softmax(dim=1) + + + def _make_layer(self, block, planes, blocks, kernel_size=56, stride=1, dilate=False): + norm_layer = self._norm_layer + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + norm_layer(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample, groups=self.groups, + base_width=self.base_width, dilation=previous_dilation, + norm_layer=norm_layer, kernel_size=kernel_size)) + self.inplanes = planes * block.expansion + if stride != 1: + kernel_size = kernel_size // 2 + + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes, groups=self.groups, + base_width=self.base_width, dilation=self.dilation, + norm_layer=norm_layer, kernel_size=kernel_size)) + + return nn.Sequential(*layers) + + def _forward_impl(self, x): + + # AxialAttention Encoder + # pdb.set_trace() + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.bn2(x) + x = self.relu(x) + x = self.conv3(x) + x = self.bn3(x) + x = self.relu(x) + + x1 = self.layer1(x) + + x2 = self.layer2(x1) + # print(x2.shape) + x3 = self.layer3(x2) + # print(x3.shape) + x4 = 
class medt_net(nn.Module):
    """Two-branch segmentation network: a whole-image branch plus a patch branch.

    * Global branch: stem (three convs), two attention stages built from
      ``block``, then ``decoder4``/``decoder5`` with bilinear x2 upsampling.
    * Local branch: the input is cut into a ``_PATCH_GRID x _PATCH_GRID`` grid;
      every patch goes through its own stem, four attention stages built from
      ``block_2`` and five decoder convs, and the result is written back at the
      patch's position.

    The two feature maps are added, passed through ``decoderf`` + ReLU and
    mapped to ``num_classes`` channels by ``adjust``.

    Args:
        block: Block class for the global branch (needs an ``expansion``
            attribute and the constructor signature used by ``_make_layer``).
        block_2: Block class for the local branch, same requirements.
        layers: Four block counts; entries 0-1 are used by both branches,
            entries 2-3 by the local branch only.
        num_classes: Number of output channels.
        zero_init_residual: Accepted for signature compatibility; unused.
        groups: Group count forwarded to every block.
        width_per_group: Forwarded to every block as ``base_width``.
        replace_stride_with_dilation: ``None`` or three booleans, forwarded to
            ``_make_layer`` as ``dilate`` for stages 2-4.
        norm_layer: Normalisation layer factory; ``nn.BatchNorm2d`` if ``None``.
        s: Channel multiplier applied to all stage widths.
        img_size: Side length the attention kernel sizes are derived from.
        imgchan: Number of input image channels.

    Raises:
        ValueError: If ``replace_stride_with_dilation`` is not of length 3.
    """

    # Patches per image side in the local branch.
    _PATCH_GRID = 4

    def __init__(self, block, block_2, layers, num_classes=2, zero_init_residual=True,
                 groups=8, width_per_group=64, replace_stride_with_dilation=None,
                 norm_layer=None, s=0.125, img_size=128, imgchan=3):
        super(medt_net, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer

        # NOTE: ``self.inplanes`` is mutated by every ``_make_layer`` call, so
        # the statement order below determines all channel counts.
        self.inplanes = int(64 * s)
        self.dilation = 1
        if replace_stride_with_dilation is None:
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
        self.groups = groups
        self.base_width = width_per_group

        # ---- Global branch: stem (stride-2 conv halves the resolution) ----
        self.conv1 = nn.Conv2d(imgchan, self.inplanes, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.conv2 = nn.Conv2d(self.inplanes, 128, kernel_size=3, stride=1, padding=1, bias=False)
        self.conv3 = nn.Conv2d(128, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = norm_layer(self.inplanes)
        self.bn2 = norm_layer(128)
        self.bn3 = norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)

        # ---- Global branch: two attention stages ----
        self.layer1 = self._make_layer(block, int(128 * s), layers[0], kernel_size=(img_size // 2))
        self.layer2 = self._make_layer(block, int(256 * s), layers[1], stride=2,
                                       kernel_size=(img_size // 2),
                                       dilate=replace_stride_with_dilation[0])

        # ---- Global branch: decoder ----
        self.decoder4 = nn.Conv2d(int(512 * s), int(256 * s), kernel_size=3, stride=1, padding=1)
        self.decoder5 = nn.Conv2d(int(256 * s), int(128 * s), kernel_size=3, stride=1, padding=1)
        self.adjust = nn.Conv2d(int(128 * s), num_classes, kernel_size=1, stride=1, padding=0)
        self.soft = nn.Softmax(dim=1)  # registered but not applied in forward

        # ---- Local branch: stem ----
        # ``self.inplanes`` now holds the output width of ``layer2``; the patch
        # stem therefore works with that channel count.
        self.conv1_p = nn.Conv2d(imgchan, self.inplanes, kernel_size=7, stride=2, padding=3,
                                 bias=False)
        self.conv2_p = nn.Conv2d(self.inplanes, 128, kernel_size=3, stride=1, padding=1,
                                 bias=False)
        self.conv3_p = nn.Conv2d(128, self.inplanes, kernel_size=3, stride=1, padding=1,
                                 bias=False)
        self.bn1_p = norm_layer(self.inplanes)
        self.bn2_p = norm_layer(128)
        self.bn3_p = norm_layer(self.inplanes)

        self.relu_p = nn.ReLU(inplace=True)  # registered; forward reuses self.relu

        # Attention kernel sizes of the local branch are derived from the
        # patch side length, i.e. a quarter of the image side.
        img_size_p = img_size // 4

        # ---- Local branch: four attention stages ----
        self.layer1_p = self._make_layer(block_2, int(128 * s), layers[0],
                                         kernel_size=(img_size_p // 2))
        self.layer2_p = self._make_layer(block_2, int(256 * s), layers[1], stride=2,
                                         kernel_size=(img_size_p // 2),
                                         dilate=replace_stride_with_dilation[0])
        self.layer3_p = self._make_layer(block_2, int(512 * s), layers[2], stride=2,
                                         kernel_size=(img_size_p // 4),
                                         dilate=replace_stride_with_dilation[1])
        self.layer4_p = self._make_layer(block_2, int(1024 * s), layers[3], stride=2,
                                         kernel_size=(img_size_p // 8),
                                         dilate=replace_stride_with_dilation[2])

        # ---- Local branch: decoder ----
        self.decoder1_p = nn.Conv2d(int(1024 * 2 * s), int(1024 * 2 * s), kernel_size=3, stride=2, padding=1)
        self.decoder2_p = nn.Conv2d(int(1024 * 2 * s), int(1024 * s), kernel_size=3, stride=1, padding=1)
        self.decoder3_p = nn.Conv2d(int(1024 * s), int(512 * s), kernel_size=3, stride=1, padding=1)
        self.decoder4_p = nn.Conv2d(int(512 * s), int(256 * s), kernel_size=3, stride=1, padding=1)
        self.decoder5_p = nn.Conv2d(int(256 * s), int(128 * s), kernel_size=3, stride=1, padding=1)

        # ---- Fusion head ----
        self.decoderf = nn.Conv2d(int(128 * s), int(128 * s), kernel_size=3, stride=1, padding=1)
        self.adjust_p = nn.Conv2d(int(128 * s), num_classes, kernel_size=1, stride=1, padding=0)
        self.soft_p = nn.Softmax(dim=1)

    def _make_layer(self, block, planes, blocks, kernel_size=56, stride=1, dilate=False):
        """Build one stage of ``blocks`` consecutive ``block`` modules.

        The first block receives ``stride`` and, when the stride is not 1 or
        the channel count changes, a 1x1-conv + norm shortcut.  With
        ``dilate=True`` the stride is folded into ``self.dilation`` and reset
        to 1.  After a strided first block the remaining blocks are built with
        half the ``kernel_size``.  Updates ``self.inplanes`` to the stage's
        output width.
        """
        norm_layer = self._norm_layer
        downsample = None
        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                norm_layer(planes * block.expansion),
            )

        layers = [block(self.inplanes, planes, stride, downsample, groups=self.groups,
                        base_width=self.base_width, dilation=previous_dilation,
                        norm_layer=norm_layer, kernel_size=kernel_size)]
        self.inplanes = planes * block.expansion
        if stride != 1:
            kernel_size = kernel_size // 2

        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes, groups=self.groups,
                                base_width=self.base_width, dilation=self.dilation,
                                norm_layer=norm_layer, kernel_size=kernel_size))

        return nn.Sequential(*layers)

    def _global_branch(self, x):
        """Run the whole-image encoder/decoder and return its feature map."""
        x = self.relu(self.bn1(self.conv1(x)))   # imgchan -> inplanes
        x = self.relu(self.bn2(self.conv2(x)))   # inplanes -> 128
        x = self.relu(self.bn3(self.conv3(x)))   # 128 -> inplanes

        x1 = self.layer1(x)
        x2 = self.layer2(x1)

        x = F.relu(F.interpolate(self.decoder4(x2), scale_factor=(2, 2), mode='bilinear'))
        x = torch.add(x, x1)
        x = F.relu(F.interpolate(self.decoder5(x), scale_factor=(2, 2), mode='bilinear'))
        return x

    def _local_branch(self, x_p):
        """Run a single image patch through the patch encoder/decoder."""
        x_p = self.relu(self.bn1_p(self.conv1_p(x_p)))
        x_p = self.relu(self.bn2_p(self.conv2_p(x_p)))
        x_p = self.relu(self.bn3_p(self.conv3_p(x_p)))

        x1_p = self.layer1_p(x_p)
        x2_p = self.layer2_p(x1_p)
        x3_p = self.layer3_p(x2_p)
        x4_p = self.layer4_p(x3_p)

        x_p = F.relu(F.interpolate(self.decoder1_p(x4_p), scale_factor=(2, 2), mode='bilinear'))
        x_p = torch.add(x_p, x4_p)
        x_p = F.relu(F.interpolate(self.decoder2_p(x_p), scale_factor=(2, 2), mode='bilinear'))
        x_p = torch.add(x_p, x3_p)
        x_p = F.relu(F.interpolate(self.decoder3_p(x_p), scale_factor=(2, 2), mode='bilinear'))
        x_p = torch.add(x_p, x2_p)
        x_p = F.relu(F.interpolate(self.decoder4_p(x_p), scale_factor=(2, 2), mode='bilinear'))
        x_p = torch.add(x_p, x1_p)
        x_p = F.relu(F.interpolate(self.decoder5_p(x_p), scale_factor=(2, 2), mode='bilinear'))
        return x_p

    def _forward_impl(self, x):
        """Fuse the global feature map with the per-patch feature maps."""
        global_feat = self._global_branch(x)

        # Start from a copy so the buffer has the right shape/dtype/device;
        # every patch position is then overwritten by the local branch.
        local_feat = global_feat.clone()

        # Patch side lengths follow the input instead of the former
        # hard-coded 32 pixels (which was only correct for 128x128 inputs).
        grid = self._PATCH_GRID
        patch_h = x.shape[2] // grid
        patch_w = x.shape[3] // grid
        for i in range(grid):
            rows = slice(patch_h * i, patch_h * (i + 1))
            for j in range(grid):
                cols = slice(patch_w * j, patch_w * (j + 1))
                local_feat[:, :, rows, cols] = self._local_branch(x[:, :, rows, cols])

        fused = torch.add(global_feat, local_feat)
        fused = F.relu(self.decoderf(fused))
        return self.adjust(fused)

    def forward(self, x):
        """Return the ``num_classes``-channel output map for image batch ``x``."""
        return self._forward_impl(x)
class AxialAttention(nn.Module):
    """Multi-group self-attention along one spatial axis with relative positions.

    The input is attended along the height axis (``width=False``) or the width
    axis (``width=True``).  Similarity is the sum of three batch-normalised
    terms (query-key, query-position, key-position); the output is the sum of
    the batch-normalised value and value-position terms.

    Args:
        in_planes: Input channels; must be divisible by ``groups``.
        out_planes: Output channels; must be divisible by ``groups``.
        groups: Number of attention groups.
        kernel_size: Length of the attended axis the position table is built
            for.
        stride: If greater than 1, the output is average-pooled by this factor.
        bias: Stored on the module; the qkv projection is always built without
            bias.
        width: Attend along the width axis instead of the height axis.
        vis_dir: Root directory for the embedding snapshots written on every
            forward pass (``<vis_dir>/q``, ``/k``, ``/v``).  ``None`` turns the
            snapshots off.  The default keeps the historical ``glas`` location.
    """

    def __init__(self, in_planes, out_planes, groups=8, kernel_size=56,
                 stride=1, bias=False, width=False, vis_dir="glas"):
        assert (in_planes % groups == 0) and (out_planes % groups == 0)
        super(AxialAttention, self).__init__()
        self.in_planes = in_planes
        self.out_planes = out_planes
        self.groups = groups
        self.group_planes = out_planes // groups
        self.kernel_size = kernel_size
        self.stride = stride
        self.bias = bias
        self.width = width
        self.vis_dir = vis_dir

        # Multi-head self attention
        self.qkv_transform = qkv_transform(in_planes, out_planes * 2, kernel_size=1, stride=1,
                                           padding=0, bias=False)
        self.bn_qkv = nn.BatchNorm1d(out_planes * 2)
        self.bn_similarity = nn.BatchNorm2d(groups * 3)
        self.bn_output = nn.BatchNorm1d(out_planes * 2)

        # Position embedding: one row per q/k/v channel, one column per
        # relative offset in [-(kernel_size - 1), kernel_size - 1].
        self.relative = nn.Parameter(torch.randn(self.group_planes * 2, kernel_size * 2 - 1),
                                     requires_grad=True)
        query_index = torch.arange(kernel_size).unsqueeze(0)
        key_index = torch.arange(kernel_size).unsqueeze(1)
        relative_index = key_index - query_index + kernel_size - 1
        self.register_buffer('flatten_index', relative_index.view(-1))
        if stride > 1:
            self.pooling = nn.AvgPool2d(stride, stride=stride)

        self.reset_parameters()

    def _save_embedding_maps(self, root, q_embedding, k_embedding, v_embedding):
        """Write the first channel of each embedding table as a PNG under ``root``."""
        import os  # local import: only needed for this debugging aid

        # One random tag per call, shared by the q/k/v images.
        tag = random.randint(0, 100)
        for name, table in (("q", q_embedding), ("k", k_embedding), ("v", v_embedding)):
            out_dir = os.path.join(root, name)
            # Previously a missing directory made savefig raise.
            os.makedirs(out_dir, exist_ok=True)
            # A dedicated figure that is closed afterwards, so images no
            # longer pile up on the implicit global figure.
            fig = plt.figure()
            plt.imshow(table[0].detach().cpu().numpy())
            plt.savefig(os.path.join(out_dir, "%d.png" % tag))
            plt.close(fig)

    def forward(self, x):
        """Apply axial attention to a 4-D feature map and return a 4-D map."""
        if self.width:
            x = x.permute(0, 2, 1, 3)
        else:
            x = x.permute(0, 3, 1, 2)  # N, W, C, H
        N, W, C, H = x.shape
        x = x.contiguous().view(N * W, C, H)

        # Transformations
        qkv = self.bn_qkv(self.qkv_transform(x))
        split_sizes = [self.group_planes // 2, self.group_planes // 2, self.group_planes]
        q, k, v = torch.split(qkv.reshape(N * W, self.groups, self.group_planes * 2, H),
                              split_sizes, dim=2)

        # Calculate position embedding
        all_embeddings = torch.index_select(self.relative, 1, self.flatten_index).view(
            self.group_planes * 2, self.kernel_size, self.kernel_size)
        q_embedding, k_embedding, v_embedding = torch.split(all_embeddings, split_sizes, dim=0)

        # getattr keeps modules that were pickled before ``vis_dir`` existed working.
        vis_dir = getattr(self, "vis_dir", "glas")
        if vis_dir is not None:
            self._save_embedding_maps(vis_dir, q_embedding, k_embedding, v_embedding)

        qr = torch.einsum('bgci,cij->bgij', q, q_embedding)
        kr = torch.einsum('bgci,cij->bgij', k, k_embedding).transpose(2, 3)
        qk = torch.einsum('bgci, bgcj->bgij', q, k)

        # Normalise the three terms jointly, then sum them per group.
        stacked_similarity = torch.cat([qk, qr, kr], dim=1)
        stacked_similarity = self.bn_similarity(stacked_similarity).view(
            N * W, 3, self.groups, H, H).sum(dim=1)
        similarity = F.softmax(stacked_similarity, dim=3)

        sv = torch.einsum('bgij,bgcj->bgci', similarity, v)
        sve = torch.einsum('bgij,cij->bgci', similarity, v_embedding)
        stacked_output = torch.cat([sv, sve], dim=-1).view(N * W, self.out_planes * 2, H)
        output = self.bn_output(stacked_output).view(N, W, self.out_planes, 2, H).sum(dim=-2)

        if self.width:
            output = output.permute(0, 2, 1, 3)
        else:
            output = output.permute(0, 2, 3, 1)

        if self.stride > 1:
            output = self.pooling(output)

        return output

    def reset_parameters(self):
        """Initialise the qkv projection and the relative-position table."""
        self.qkv_transform.weight.data.normal_(0, math.sqrt(1. / self.in_planes))
        nn.init.normal_(self.relative, 0., math.sqrt(1. / self.group_planes))
self.register_buffer('flatten_index', relative_index.view(-1)) + if stride > 1: + self.pooling = nn.AvgPool2d(stride, stride=stride) + + self.reset_parameters() + # self.print_para() + + def forward(self, x): + if self.width: + x = x.permute(0, 2, 1, 3) + else: + x = x.permute(0, 3, 1, 2) # N, W, C, H + N, W, C, H = x.shape + x = x.contiguous().view(N * W, C, H) + + # Transformations + qkv = self.bn_qkv(self.qkv_transform(x)) + q, k, v = torch.split(qkv.reshape(N * W, self.groups, self.group_planes * 2, H), [self.group_planes // 2, self.group_planes // 2, self.group_planes], dim=2) + + # Calculate position embedding + all_embeddings = torch.index_select(self.relative, 1, self.flatten_index).view(self.group_planes * 2, self.kernel_size, self.kernel_size) + q_embedding, k_embedding, v_embedding = torch.split(all_embeddings, [self.group_planes // 2, self.group_planes // 2, self.group_planes], dim=0) + qr = torch.einsum('bgci,cij->bgij', q, q_embedding) + kr = torch.einsum('bgci,cij->bgij', k, k_embedding).transpose(2, 3) + qk = torch.einsum('bgci, bgcj->bgij', q, k) + # print(qk.shape, qr.shape, kr.shape) + # import pdb + # pdb.set_trace() + + # multiply by factors + qr = torch.mul(qr, self.f_qr) + kr = torch.mul(kr, self.f_kr) + + stacked_similarity = torch.cat([qk, qr, kr], dim=1) + stacked_similarity = self.bn_similarity(stacked_similarity).view(N * W, 3, self.groups, H, H).sum(dim=1) + #stacked_similarity = self.bn_qr(qr) + self.bn_kr(kr) + self.bn_qk(qk) + # (N, groups, H, H, W) + similarity = F.softmax(stacked_similarity, dim=3) + sv = torch.einsum('bgij,bgcj->bgci', similarity, v) + sve = torch.einsum('bgij,cij->bgci', similarity, v_embedding) + + # multiply by factors + sv = torch.mul(sv, self.f_sv) + sve = torch.mul(sve, self.f_sve) + + stacked_output = torch.cat([sv, sve], dim=-1).view(N * W, self.out_planes * 2, H) + output = self.bn_output(stacked_output).view(N, W, self.out_planes, 2, H).sum(dim=-2) + + if self.width: + output = output.permute(0, 2, 1, 3) 
+ else: + output = output.permute(0, 2, 3, 1) + + if self.stride > 1: + output = self.pooling(output) + + return output + def reset_parameters(self): + self.qkv_transform.weight.data.normal_(0, math.sqrt(1. / self.in_planes)) + #nn.init.uniform_(self.relative, -0.1, 0.1) + nn.init.normal_(self.relative, 0., math.sqrt(1. / self.group_planes)) + +class AxialAttention_gated_sig(nn.Module): + def __init__(self, in_planes, out_planes, groups=8, kernel_size=56, + stride=1, bias=False, width=False): + assert (in_planes % groups == 0) and (out_planes % groups == 0) + super(AxialAttention_gated_sig, self).__init__() + self.in_planes = in_planes + self.out_planes = out_planes + self.groups = groups + self.group_planes = out_planes // groups + self.kernel_size = kernel_size + self.stride = stride + self.bias = bias + self.width = width + + # Multi-head self attention + self.qkv_transform = qkv_transform(in_planes, out_planes * 2, kernel_size=1, stride=1, + padding=0, bias=False) + self.bn_qkv = nn.BatchNorm1d(out_planes * 2) + self.bn_similarity = nn.BatchNorm2d(groups * 3) + #self.bn_qk = nn.BatchNorm2d(groups) + #self.bn_qr = nn.BatchNorm2d(groups) + #self.bn_kr = nn.BatchNorm2d(groups) + self.bn_output = nn.BatchNorm1d(out_planes * 2) + + # Priority on encoding + + self.f_qr = nn.Parameter(torch.tensor(0.1), requires_grad=False) + self.f_kr = nn.Parameter(torch.tensor(0.1), requires_grad=False) + self.f_sve = nn.Parameter(torch.tensor(0.1), requires_grad=False) + self.f_sv = nn.Parameter(torch.tensor(5.0), requires_grad=False) + + # Position embedding + self.relative = nn.Parameter(torch.randn(self.group_planes * 2, kernel_size * 2 - 1), requires_grad=True) + query_index = torch.arange(kernel_size).unsqueeze(0) + key_index = torch.arange(kernel_size).unsqueeze(1) + relative_index = key_index - query_index + kernel_size - 1 + self.register_buffer('flatten_index', relative_index.view(-1)) + if stride > 1: + self.pooling = nn.AvgPool2d(stride, stride=stride) + + 
self.reset_parameters() + # self.print_para() + + def forward(self, x): + if self.width: + x = x.permute(0, 2, 1, 3) + else: + x = x.permute(0, 3, 1, 2) # N, W, C, H + N, W, C, H = x.shape + x = x.contiguous().view(N * W, C, H) + + # Transformations + qkv = self.bn_qkv(self.qkv_transform(x)) + q, k, v = torch.split(qkv.reshape(N * W, self.groups, self.group_planes * 2, H), [self.group_planes // 2, self.group_planes // 2, self.group_planes], dim=2) + + # Calculate position embedding + all_embeddings = torch.index_select(self.relative, 1, self.flatten_index).view(self.group_planes * 2, self.kernel_size, self.kernel_size) + q_embedding, k_embedding, v_embedding = torch.split(all_embeddings, [self.group_planes // 2, self.group_planes // 2, self.group_planes], dim=0) + qr = torch.einsum('bgci,cij->bgij', q, q_embedding) + kr = torch.einsum('bgci,cij->bgij', k, k_embedding).transpose(2, 3) + qk = torch.einsum('bgci, bgcj->bgij', q, k) + # print(qk.shape, qr.shape, kr.shape) + # import pdb + # pdb.set_trace() + + # multiply by factors + qr = torch.mul(qr, torch.sigmoid(self.f_qr)) + kr = torch.mul(kr, torch.sigmoid(self.f_kr)) + + stacked_similarity = torch.cat([qk, qr, kr], dim=1) + stacked_similarity = self.bn_similarity(stacked_similarity).view(N * W, 3, self.groups, H, H).sum(dim=1) + #stacked_similarity = self.bn_qr(qr) + self.bn_kr(kr) + self.bn_qk(qk) + # (N, groups, H, H, W) + similarity = F.softmax(stacked_similarity, dim=3) + sv = torch.einsum('bgij,bgcj->bgci', similarity, v) + sve = torch.einsum('bgij,cij->bgci', similarity, v_embedding) + + # multiply by factors + sv = torch.mul(sv, torch.sigmoid(self.f_sv)) + sve = torch.mul(sve, torch.sigmoid(self.f_sve)) + + stacked_output = torch.cat([sv, sve], dim=-1).view(N * W, self.out_planes * 2, H) + output = self.bn_output(stacked_output).view(N, W, self.out_planes, 2, H).sum(dim=-2) + + if self.width: + output = output.permute(0, 2, 1, 3) + else: + output = output.permute(0, 2, 3, 1) + + if self.stride > 1: + 
class AxialAttention_gated_data(nn.Module):
    """Axial attention whose positional terms are scaled by data-dependent gates.

    Four gates are predicted per attended row from its average-pooled
    features (``pool -> fcn1 -> ReLU -> fcn2 -> ReLU -> sigmoid``) and
    multiply, in order, the query-position term, the key-position term, the
    value term and the value-position term.  Because the sigmoid is applied
    to a ReLU output, every gate lies in ``[0.5, 1)``.

    Args:
        in_planes: Input channels; must be divisible by ``groups``.
        out_planes: Output channels; must be divisible by ``groups``.
        groups: Number of attention groups.
        kernel_size: Length of the attended axis the position table is built
            for.
        stride: If greater than 1, the output is average-pooled by this factor.
        bias: Stored on the module; the qkv projection is always built without
            bias.
        width: Attend along the width axis instead of the height axis.
    """

    def __init__(self, in_planes, out_planes, groups=8, kernel_size=56,
                 stride=1, bias=False, width=False):
        assert (in_planes % groups == 0) and (out_planes % groups == 0)
        super(AxialAttention_gated_data, self).__init__()
        self.in_planes = in_planes
        self.out_planes = out_planes
        self.groups = groups
        self.group_planes = out_planes // groups
        self.kernel_size = kernel_size
        self.stride = stride
        self.bias = bias
        self.width = width

        # Multi-head self attention
        self.qkv_transform = qkv_transform(in_planes, out_planes * 2, kernel_size=1, stride=1,
                                           padding=0, bias=False)
        self.bn_qkv = nn.BatchNorm1d(out_planes * 2)
        self.bn_similarity = nn.BatchNorm2d(groups * 3)
        self.bn_output = nn.BatchNorm1d(out_planes * 2)

        # Gate predictor: pooled features -> 4 gate logits.
        self.fcn1 = nn.Linear(in_planes, in_planes)
        self.fcn2 = nn.Linear(in_planes, 4)
        self.pool = nn.AdaptiveAvgPool2d((1, 1))

        # Position embedding
        self.relative = nn.Parameter(torch.randn(self.group_planes * 2, kernel_size * 2 - 1),
                                     requires_grad=True)
        query_index = torch.arange(kernel_size).unsqueeze(0)
        key_index = torch.arange(kernel_size).unsqueeze(1)
        relative_index = key_index - query_index + kernel_size - 1
        self.register_buffer('flatten_index', relative_index.view(-1))
        if stride > 1:
            self.pooling = nn.AvgPool2d(stride, stride=stride)

        self.reset_parameters()

    def forward(self, x):
        """Apply gated axial attention to a 4-D feature map and return a 4-D map."""
        if self.width:
            x = x.permute(0, 2, 1, 3)
        else:
            x = x.permute(0, 3, 1, 2)  # N, W, C, H
        N, W, C, H = x.shape
        x = x.contiguous().view(N * W, C, H)

        # Predict the four gates from the features averaged over the attended axis.
        xn = self.pool(x.unsqueeze(3))
        xn = F.relu(self.fcn1(xn.squeeze(2).squeeze(2)))
        xn = F.relu(self.fcn2(xn))
        # torch.sigmoid replaces the deprecated F.sigmoid (same values).
        sig = torch.sigmoid(xn)
        # Shape (N*W, 1, 1, 1) so each gate broadcasts over groups and positions.
        gate_qr = sig[:, 0].reshape(-1, 1, 1, 1)
        gate_kr = sig[:, 1].reshape(-1, 1, 1, 1)
        gate_sv = sig[:, 2].reshape(-1, 1, 1, 1)
        gate_sve = sig[:, 3].reshape(-1, 1, 1, 1)

        # Transformations
        qkv = self.bn_qkv(self.qkv_transform(x))
        split_sizes = [self.group_planes // 2, self.group_planes // 2, self.group_planes]
        q, k, v = torch.split(qkv.reshape(N * W, self.groups, self.group_planes * 2, H),
                              split_sizes, dim=2)

        # Calculate position embedding
        all_embeddings = torch.index_select(self.relative, 1, self.flatten_index).view(
            self.group_planes * 2, self.kernel_size, self.kernel_size)
        q_embedding, k_embedding, v_embedding = torch.split(all_embeddings, split_sizes, dim=0)
        qr = torch.einsum('bgci,cij->bgij', q, q_embedding)
        kr = torch.einsum('bgci,cij->bgij', k, k_embedding).transpose(2, 3)
        qk = torch.einsum('bgci, bgcj->bgij', q, k)

        # Gate the positional similarity terms.
        qr = gate_qr * qr
        kr = gate_kr * kr

        stacked_similarity = torch.cat([qk, qr, kr], dim=1)
        stacked_similarity = self.bn_similarity(stacked_similarity).view(
            N * W, 3, self.groups, H, H).sum(dim=1)
        similarity = F.softmax(stacked_similarity, dim=3)

        sv = torch.einsum('bgij,bgcj->bgci', similarity, v)
        sve = torch.einsum('bgij,cij->bgci', similarity, v_embedding)

        # Gate the value and value-position terms.
        sv = gate_sv * sv
        sve = gate_sve * sve

        stacked_output = torch.cat([sv, sve], dim=-1).view(N * W, self.out_planes * 2, H)
        output = self.bn_output(stacked_output).view(N, W, self.out_planes, 2, H).sum(dim=-2)

        if self.width:
            output = output.permute(0, 2, 1, 3)
        else:
            output = output.permute(0, 2, 3, 1)

        if self.stride > 1:
            output = self.pooling(output)

        return output

    def print_para(self):
        """Print the parameters of the gate predictor's output layer.

        This class has no scalar ``f_qr`` gate (the gates are computed from
        the input), so the former ``print(self.f_qr)`` raised
        ``AttributeError``.
        """
        print(self.fcn2.weight)
        print(self.fcn2.bias)

    def reset_parameters(self):
        """Initialise the qkv projection and the relative-position table."""
        self.qkv_transform.weight.data.normal_(0, math.sqrt(1. / self.in_planes))
        nn.init.normal_(self.relative, 0., math.sqrt(1. / self.group_planes))
torch.arange(kernel_size).unsqueeze(0) + # key_index = torch.arange(kernel_size).unsqueeze(1) + # relative_index = key_index - query_index + kernel_size - 1 + # self.register_buffer('flatten_index', relative_index.view(-1)) + if stride > 1: + self.pooling = nn.AvgPool2d(stride, stride=stride) + + self.reset_parameters() + + def forward(self, x): + if self.width: + x = x.permute(0, 2, 1, 3) + else: + x = x.permute(0, 3, 1, 2) # N, W, C, H + N, W, C, H = x.shape + x = x.contiguous().view(N * W, C, H) + + # Transformations + qkv = self.bn_qkv(self.qkv_transform(x)) + q, k, v = torch.split(qkv.reshape(N * W, self.groups, self.group_planes * 2, H), [self.group_planes // 2, self.group_planes // 2, self.group_planes], dim=2) + + # Calculate position embedding + # all_embeddings = torch.index_select(self.relative, 1, self.flatten_index).view(self.group_planes * 2, self.kernel_size, self.kernel_size) + # q_embedding, k_embedding, v_embedding = torch.split(all_embeddings, [self.group_planes // 2, self.group_planes // 2, self.group_planes], dim=0) + # qr = torch.einsum('bgci,cij->bgij', q, q_embedding) + # kr = torch.einsum('bgci,cij->bgij', k, k_embedding).transpose(2, 3) + qk = torch.einsum('bgci, bgcj->bgij', q, k) + # qr = q + # kr = k.transpose(2, 3) + # # print(qk.shape, qr.shape, kr.shape) + # stacked_similarity = torch.cat([qk, qk, qk], dim=1) + stacked_similarity = self.bn_similarity(qk).reshape(N * W, 1, self.groups, H, H).sum(dim=1).contiguous() + #stacked_similarity = self.bn_qr(qr) + self.bn_kr(kr) + self.bn_qk(qk) + # (N, groups, H, H, W) + # import pdb + # pdb.set_trace() + similarity = F.softmax(stacked_similarity, dim=3) + sv = torch.einsum('bgij,bgcj->bgci', similarity, v) + # sve = torch.einsum('bgij,bgcj->bgci', similarity, v) + # stacked_output = torch.cat([sv, sve], dim=-1).view(N * W, self.out_planes * 2, H) + # import pdb + # pdb.set_trace() + sv = sv.reshape(N*W,self.out_planes * 1, H).contiguous() + output = self.bn_output(sv).reshape(N, W, 
self.out_planes, 1, H).sum(dim=-2).contiguous() + + + if self.width: + output = output.permute(0, 2, 1, 3) + else: + output = output.permute(0, 2, 3, 1) + + if self.stride > 1: + output = self.pooling(output) + + return output + + def reset_parameters(self): + self.qkv_transform.weight.data.normal_(0, math.sqrt(1. / self.in_planes)) + #nn.init.uniform_(self.relative, -0.1, 0.1) + # nn.init.normal_(self.relative, 0., math.sqrt(1. / self.group_planes)) + +#end of attn definition + +class AxialBlock(nn.Module): + expansion = 2 + + def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, + base_width=64, dilation=1, norm_layer=None, kernel_size=56): + super(AxialBlock, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + width = int(planes * (base_width / 64.)) + # Both self.conv2 and self.downsample layers downsample the input when stride != 1 + self.conv_down = conv1x1(inplanes, width) + self.bn1 = norm_layer(width) + self.hight_block = AxialAttention(width, width, groups=groups, kernel_size=kernel_size) + self.width_block = AxialAttention(width, width, groups=groups, kernel_size=kernel_size, stride=stride, width=True) + self.conv_up = conv1x1(width, planes * self.expansion) + self.bn2 = norm_layer(planes * self.expansion) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + + out = self.conv_down(x) + out = self.bn1(out) + out = self.relu(out) + # print(out.shape) + out = self.hight_block(out) + out = self.width_block(out) + out = self.relu(out) + + out = self.conv_up(out) + out = self.bn2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + +class AxialBlock_dynamic(nn.Module): + expansion = 2 + + def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, + base_width=64, dilation=1, norm_layer=None, kernel_size=56): + super(AxialBlock_dynamic, 
self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + width = int(planes * (base_width / 64.)) + # Both self.conv2 and self.downsample layers downsample the input when stride != 1 + self.conv_down = conv1x1(inplanes, width) + self.bn1 = norm_layer(width) + self.hight_block = AxialAttention_dynamic(width, width, groups=groups, kernel_size=kernel_size) + self.width_block = AxialAttention_dynamic(width, width, groups=groups, kernel_size=kernel_size, stride=stride, width=True) + self.conv_up = conv1x1(width, planes * self.expansion) + self.bn2 = norm_layer(planes * self.expansion) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + + out = self.conv_down(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.hight_block(out) + out = self.width_block(out) + out = self.relu(out) + + out = self.conv_up(out) + out = self.bn2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + +class AxialBlock_gated_data(nn.Module): + expansion = 2 + + def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, + base_width=64, dilation=1, norm_layer=None, kernel_size=56): + super(AxialBlock_gated_data, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + width = int(planes * (base_width / 64.)) + # Both self.conv2 and self.downsample layers downsample the input when stride != 1 + self.conv_down = conv1x1(inplanes, width) + self.bn1 = norm_layer(width) + self.hight_block = AxialAttention_gated_data(width, width, groups=groups, kernel_size=kernel_size) + self.width_block = AxialAttention_gated_data(width, width, groups=groups, kernel_size=kernel_size, stride=stride, width=True) + self.conv_up = conv1x1(width, planes * self.expansion) + self.bn2 = norm_layer(planes * self.expansion) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride 
+ + def forward(self, x): + identity = x + + out = self.conv_down(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.hight_block(out) + out = self.width_block(out) + out = self.relu(out) + + out = self.conv_up(out) + out = self.bn2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + +class AxialBlockmod(nn.Module): + expansion = 2 + + def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, + base_width=64, dilation=1, norm_layer=None, kernel_size=56): + super(AxialBlockmod, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + # print(kernel_size) + width = int(planes * (base_width / 64.)) + # Both self.conv2 and self.downsample layers downsample the input when stride != 1 + self.conv_down = conv1x1(inplanes, width) + self.conv1 = nn.Conv2d(width, width, kernel_size = 1) + self.bn1 = norm_layer(width) + self.hight_block = AxialAttention(width, width, groups=groups, kernel_size=kernel_size) + self.width_block = AxialAttention(width, width, groups=groups, kernel_size=kernel_size, stride=stride, width=True) + self.conv_up = conv1x1(width, planes * self.expansion) + self.bn2 = norm_layer(planes * self.expansion) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + + # pdb.set_trace() + + out = self.conv_down(x) + out = self.bn1(out) + out = self.relu(out) + # print(out.shape) + # out = self.hight_block(out) + # out = self.width_block(out) + # print(self.stride) + out = self.conv1(out) + if self.stride == 2: + out = F.max_pool2d(out,2,2) + if self.downsample is not None: + identity = self.downsample(x) + # out = F.max_pool2d(out,2,2) + # out = self.conv1(out) + + # print(out.shape) + # out = + out = self.relu(out) + + out = self.conv_up(out) + out = self.bn2(out) + + # if self.downsample is not None: + # identity = self.downsample(x) + + # out += identity + out = 
self.relu(out) + + return out + +class AxialBlock_wopos(nn.Module): + expansion = 2 + + def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, + base_width=64, dilation=1, norm_layer=None, kernel_size=56): + super(AxialBlock_wopos, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + # print(kernel_size) + width = int(planes * (base_width / 64.)) + # Both self.conv2 and self.downsample layers downsample the input when stride != 1 + self.conv_down = conv1x1(inplanes, width) + self.conv1 = nn.Conv2d(width, width, kernel_size = 1) + self.bn1 = norm_layer(width) + self.hight_block = AxialAttention_wopos(width, width, groups=groups, kernel_size=kernel_size) + self.width_block = AxialAttention_wopos(width, width, groups=groups, kernel_size=kernel_size, stride=stride, width=True) + self.conv_up = conv1x1(width, planes * self.expansion) + self.bn2 = norm_layer(planes * self.expansion) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + + # pdb.set_trace() + + out = self.conv_down(x) + out = self.bn1(out) + out = self.relu(out) + # print(out.shape) + out = self.hight_block(out) + out = self.width_block(out) + # print(self.stride) + # out = self.conv1(out) + # if self.stride == 2: + # out = F.max_pool2d(out,2,2) + # if self.downsample is not None: + # identity = self.downsample(x) + # # out = F.max_pool2d(out,2,2) + # # out = self.conv1(out) + + # # print(out.shape) + # # out = + out = self.relu(out) + + out = self.conv_up(out) + out = self.bn2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + +class AxialBlockmod_wopos(nn.Module): + expansion = 2 + + def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, + base_width=64, dilation=1, norm_layer=None, kernel_size=56): + super(AxialBlockmod_wopos, self).__init__() + if norm_layer is None: + norm_layer = 
nn.BatchNorm2d + # print(kernel_size) + width = int(planes * (base_width / 64.)) + # Both self.conv2 and self.downsample layers downsample the input when stride != 1 + self.conv_down = conv1x1(inplanes, width) + self.conv1 = nn.Conv2d(width, width, kernel_size = 1) + self.bn1 = norm_layer(width) + self.hight_block = AxialAttention_wopos(width, width, groups=groups, kernel_size=kernel_size) + self.width_block = AxialAttention_wopos(width, width, groups=groups, kernel_size=kernel_size, stride=stride, width=True) + self.conv_up = conv1x1(width, planes * self.expansion) + self.bn2 = norm_layer(planes * self.expansion) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + + # pdb.set_trace() + + out = self.conv_down(x) + out = self.bn1(out) + out = self.relu(out) + # print(out.shape) + # out = self.hight_block(out) + # out = self.width_block(out) + # print(self.stride) + out = self.conv1(out) + if self.stride == 2: + out = F.max_pool2d(out,2,2) + if self.downsample is not None: + identity = self.downsample(x) + # out = F.max_pool2d(out,2,2) + # out = self.conv1(out) + + # print(out.shape) + # out = + out = self.relu(out) + + out = self.conv_up(out) + out = self.bn2(out) + + # if self.downsample is not None: + # identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + +#end of block definition + +class AxialAttentionNet(nn.Module): + + def __init__(self, block, layers, num_classes=1000, zero_init_residual=True, + groups=8, width_per_group=64, replace_stride_with_dilation=None, + norm_layer=None, s=0.5): + super(AxialAttentionNet, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + self._norm_layer = norm_layer + + self.inplanes = int(64 * s) + self.dilation = 1 + if replace_stride_with_dilation is None: + # each element in the tuple indicates if we should replace + # the 2x2 stride with a dilated convolution instead + 
replace_stride_with_dilation = [False, False, False] + if len(replace_stride_with_dilation) != 3: + raise ValueError("replace_stride_with_dilation should be None " + "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) + self.groups = groups + self.base_width = width_per_group + self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, + bias=False) + + + self.bn1 = norm_layer(self.inplanes) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, int(128 * s), layers[0], kernel_size=56) + self.layer2 = self._make_layer(block, int(256 * s), layers[1], stride=2, kernel_size=56, + dilate=replace_stride_with_dilation[0]) + self.layer3 = self._make_layer(block, int(512 * s), layers[2], stride=2, kernel_size=28, + dilate=replace_stride_with_dilation[1]) + self.layer4 = self._make_layer(block, int(1024 * s), layers[3], stride=2, kernel_size=14, + dilate=replace_stride_with_dilation[2]) + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.fc = nn.Linear(int(1024 * block.expansion * s), num_classes) + + for m in self.modules(): + if isinstance(m, (nn.Conv2d, nn.Conv1d)): + if isinstance(m, qkv_transform): + pass + else: + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, (nn.BatchNorm2d, nn.BatchNorm1d, nn.GroupNorm)): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + # Zero-initialize the last BN in each residual branch, + # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
+ # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 + if zero_init_residual: + for m in self.modules(): + if isinstance(m, AxialBlock): + nn.init.constant_(m.bn2.weight, 0) + + def _make_layer(self, block, planes, blocks, kernel_size=56, stride=1, dilate=False): + norm_layer = self._norm_layer + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + norm_layer(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample, groups=self.groups, + base_width=self.base_width, dilation=previous_dilation, + norm_layer=norm_layer, kernel_size=kernel_size)) + self.inplanes = planes * block.expansion + if stride != 1: + kernel_size = kernel_size // 2 + + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes, groups=self.groups, + base_width=self.base_width, dilation=self.dilation, + norm_layer=norm_layer, kernel_size=kernel_size)) + + return nn.Sequential(*layers) + + def _forward_impl(self, x): + # See note [TorchScript super()] + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + x = self.avgpool(x) + x = torch.flatten(x, 1) + x = self.fc(x) + + return x + + def forward(self, x): + return self._forward_impl(x) + +class ResAxialAttentionUNet(nn.Module): + + def __init__(self, block, layers, num_classes=2, zero_init_residual=True, + groups=8, width_per_group=64, replace_stride_with_dilation=None, + norm_layer=None, s=0.125, img_size = 128,imgchan = 3): + super(ResAxialAttentionUNet, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + self._norm_layer = norm_layer + + self.inplanes = int(64 * s) + self.dilation = 1 + if 
replace_stride_with_dilation is None: + # each element in the tuple indicates if we should replace + # the 2x2 stride with a dilated convolution instead + replace_stride_with_dilation = [False, False, False] + if len(replace_stride_with_dilation) != 3: + raise ValueError("replace_stride_with_dilation should be None " + "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) + self.groups = groups + self.base_width = width_per_group + self.conv1 = nn.Conv2d(imgchan, self.inplanes, kernel_size=7, stride=2, padding=3, + bias=False) + # self.conv2 = nn.Conv2d(self.inplanes, 128, kernel_size=3, stride=1, padding=1, bias=False) + # self.conv3 = nn.Conv2d(128, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = norm_layer(self.inplanes) + # self.bn2 = norm_layer(128) + # self.bn3 = norm_layer(self.inplanes) + self.relu = nn.ReLU(inplace=True) + # self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, int(128 * s), layers[0], kernel_size= (img_size//2)) + self.layer2 = self._make_layer(block, int(256 * s), layers[1], stride=2, kernel_size=(img_size//2), + dilate=replace_stride_with_dilation[0]) + self.layer3 = self._make_layer(block, int(512 * s), layers[2], stride=2, kernel_size=(img_size//4), + dilate=replace_stride_with_dilation[1]) + self.layer4 = self._make_layer(block, int(1024 * s), layers[3], stride=2, kernel_size=(img_size//8), + dilate=replace_stride_with_dilation[2]) + + # self.layer1 = nn.Conv2d(8,32,kernel_size=3, stride=1, padding=1) + # self.layer2 = nn.Conv2d(32,64,kernel_size=3, stride=1, padding=1) + # dilate=replace_stride_with_dilation[0]) + # self.layer3 = self._make_layer(block, int(512 * s), layers[2], stride=2, kernel_size=(img_size//4), + # dilate=replace_stride_with_dilation[1]) + # self.layer4 = self._make_layer(block, int(1024 * s), layers[3], stride=2, kernel_size=(img_size//8), + # dilate=replace_stride_with_dilation[2]) + + # Decoder + self.decoder1 = 
nn.Conv2d(int(1024 *2*s) , int(1024*2*s), kernel_size=3, stride=2, padding=1) + self.decoder2 = nn.Conv2d(int(1024 *2*s) , int(1024*s), kernel_size=3, stride=1, padding=1) + self.decoder3 = nn.Conv2d(int(1024*s), int(512*s), kernel_size=3, stride=1, padding=1) + self.decoder4 = nn.Conv2d(int(512*s) , int(256*s), kernel_size=3, stride=1, padding=1) + self.decoder5 = nn.Conv2d(int(256*s) , int(128*s) , kernel_size=3, stride=1, padding=1) + self.adjust = nn.Conv2d(int(128*s) , num_classes, kernel_size=1, stride=1, padding=0) + self.soft = nn.Softmax(dim=1) + + + # self.conv1_1 = nn.Conv2d(32,8,kernel_size=1, stride=1, padding=0) + # self.conv1_2 = nn.Conv2d(64,8,kernel_size=1, stride=1, padding=0) + # for m in self.modules(): + # if isinstance(m, (nn.Conv2d, nn.Conv1d)): + # if isinstance(m, qkv_transform): + # pass + # else: + # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + # elif isinstance(m, (nn.BatchNorm2d, nn.BatchNorm1d, nn.GroupNorm)): + # nn.init.constant_(m.weight, 1) + # nn.init.constant_(m.bias, 0) + + # Zero-initialize the last BN in each residual branch, + # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
+ # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 + # if zero_init_residual: + # for m in self.modules(): + # if isinstance(m, AxialBlock): + # nn.init.constant_(m.bn2.weight, 0) + + def _make_layer(self, block, planes, blocks, kernel_size=56, stride=1, dilate=False): + norm_layer = self._norm_layer + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + norm_layer(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample, groups=self.groups, + base_width=self.base_width, dilation=previous_dilation, + norm_layer=norm_layer, kernel_size=kernel_size)) + self.inplanes = planes * block.expansion + if stride != 1: + kernel_size = kernel_size // 2 + + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes, groups=self.groups, + base_width=self.base_width, dilation=self.dilation, + norm_layer=norm_layer, kernel_size=kernel_size)) + + return nn.Sequential(*layers) + + def _forward_impl(self, x): + # See note [TorchScript super()] + # AxialAttention Encoder + # pdb.set_trace() + x = self.conv1(x) + x = self.bn1(x) + # x = self.relu(x) + # x = self.conv2(x) + # x = self.bn2(x) + # x = self.relu(x) + # x = self.conv3(x) + # x = self.bn3(x) + # x = self.relu(x) + # # x = F.max_pool2d(x,2,2) + x = self.relu(x) + + # x = self.maxpool(x) + # pdb.set_trace() + # print(x.shape) + x1 = self.layer1(x) + # x1 = F.relu(F.max_pool2d(x1,2,2)) + # x1 = self.conv1_1(x1) + # print(x1.shape) + x2 = self.layer2(x1) + # print(x2.shape) + x3 = self.layer3(x2) + # print(x3.shape) + x4 = self.layer4(x3) + # print(x4.shape) + # pdb.set_trace() + # Transposed Convolution Decoder + x = F.relu(F.interpolate(self.decoder1(x4), scale_factor=(2,2), mode ='bilinear')) + # x = torch.add(x, x4) + x = 
F.relu(F.interpolate(self.decoder2(x) , scale_factor=(2,2), mode ='bilinear')) + # x = torch.add(x, x3) + x = F.relu(F.interpolate(self.decoder3(x) , scale_factor=(2,2), mode ='bilinear')) + # x = torch.add(x, x2) + x = F.relu(F.interpolate(self.decoder4(x) , scale_factor=(2,2), mode ='bilinear')) + # print(x.shape, x1.shape) + # x = torch.add(x, x1) + x = F.relu(F.interpolate(self.decoder5(x) , scale_factor=(2,2), mode ='bilinear')) + x = self.soft(self.adjust(F.relu(x))) + # pdb.set_trace() + return x + + def forward(self, x): + return self._forward_impl(x) + +class unetplus(nn.Module): + + def __init__(self, block, layers, num_classes=2, zero_init_residual=True, + groups=8, width_per_group=64, replace_stride_with_dilation=None, + norm_layer=None, s=0.125, img_size = 128,imgchan = 3): + super(unetplus, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + self._norm_layer = norm_layer + + self.inplanes = int(64 * s) + self.dilation = 1 + if replace_stride_with_dilation is None: + # each element in the tuple indicates if we should replace + # the 2x2 stride with a dilated convolution instead + replace_stride_with_dilation = [False, False, False] + if len(replace_stride_with_dilation) != 3: + raise ValueError("replace_stride_with_dilation should be None " + "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) + self.groups = groups + self.base_width = width_per_group + self.conv1 = nn.Conv2d(imgchan, self.inplanes, kernel_size=7, stride=2, padding=3, + bias=False) + # self.conv1 = nn.Conv2d(1, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = norm_layer(self.inplanes) + self.relu = nn.ReLU(inplace=True) + # self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, int(128 * s), layers[0], kernel_size= (img_size//2)) + self.layer2 = self._make_layer(block, int(256 * s), layers[1], stride=2, kernel_size=(img_size//2), + dilate=replace_stride_with_dilation[0]) + 
self.layer3 = self._make_layer(block, int(512 * s), layers[2], stride=2, kernel_size=(img_size//4), + dilate=replace_stride_with_dilation[1]) + self.layer4 = self._make_layer(block, int(1024 * s), layers[3], stride=2, kernel_size=(img_size//8), + dilate=replace_stride_with_dilation[2]) + + # Decoder + self.decoder1 = nn.Conv2d(int(1024 *2*s) , int(1024*2*s), kernel_size=3, stride=2, padding=1) + self.decoder2 = nn.Conv2d(int(1024 *2*s) , int(1024*s), kernel_size=3, stride=1, padding=1) + self.decoder3 = nn.Conv2d(int(1024*s), int(512*s), kernel_size=3, stride=1, padding=1) + self.decoder4 = nn.Conv2d(int(512*s) , int(256*s), kernel_size=3, stride=1, padding=1) + self.decoder5 = nn.Conv2d(int(256*s) , int(128*s) , kernel_size=3, stride=1, padding=1) + self.adjust = nn.Conv2d(int(128*s) , num_classes, kernel_size=1, stride=1, padding=0) + self.soft = nn.Softmax(dim=1) + + self.inter1= nn.Conv2d(32, 32, 3, stride=1, padding=1) + self.inter2= nn.Conv2d(64, 64, 3, stride=1, padding=1) + self.inter3= nn.Conv2d(128, 128, 3, stride=1, padding=1) + self.inter4= nn.Conv2d(256, 256, 3, stride=1, padding=1) + # self.inter5= nn.Conv2d(32, 32, 3, stride=1, padding=1) + + self.inte1= nn.Conv2d(32, 2, 1, stride=1, padding=0) + self.inte2= nn.Conv2d(64, 2, 1, stride=1, padding=0) + self.inte3= nn.Conv2d(128, 2, 1, stride=1, padding=0) + self.inte4= nn.Conv2d(256, 2, 1, stride=1, padding=0) + + # for m in self.modules(): + # if isinstance(m, (nn.Conv2d, nn.Conv1d)): + # if isinstance(m, qkv_transform): + # pass + # else: + # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + # elif isinstance(m, (nn.BatchNorm2d, nn.BatchNorm1d, nn.GroupNorm)): + # nn.init.constant_(m.weight, 1) + # nn.init.constant_(m.bias, 0) + + # Zero-initialize the last BN in each residual branch, + # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
+ # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 + # if zero_init_residual: + # for m in self.modules(): + # if isinstance(m, AxialBlock): + # nn.init.constant_(m.bn2.weight, 0) + + def _make_layer(self, block, planes, blocks, kernel_size=56, stride=1, dilate=False): + norm_layer = self._norm_layer + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + norm_layer(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample, groups=self.groups, + base_width=self.base_width, dilation=previous_dilation, + norm_layer=norm_layer, kernel_size=kernel_size)) + self.inplanes = planes * block.expansion + if stride != 1: + kernel_size = kernel_size // 2 + + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes, groups=self.groups, + base_width=self.base_width, dilation=self.dilation, + norm_layer=norm_layer, kernel_size=kernel_size)) + + return nn.Sequential(*layers) + + def _forward_impl(self, x): + # See note [TorchScript super()] + # AxialAttention Encoder + # pdb.set_trace() + x = self.conv1(x) + x = self.bn1(x) + # x = F.max_pool2d(x,2,2) + x = self.relu(x) + + # x = self.maxpool(x) + # pdb.set_trace() + x1 = self.layer1(x) + # print(x1.shape) + x2 = self.layer2(x1) + # print(x2.shape) + x3 = self.layer3(x2) + # print(x3.shape) + x4 = self.layer4(x3) + # print(x4.shape) + # pdb.set_trace() + # Transposed Convolution Decoder + x = F.relu(F.interpolate(self.decoder1(x4), scale_factor=(2,2), mode ='bilinear')) + x = torch.add(x, x4) + x = F.relu(F.interpolate(self.decoder2(x) , scale_factor=(2,2), mode ='bilinear')) + x = torch.add(x, x3) + x = F.relu(F.interpolate(self.decoder3(x) , scale_factor=(2,2), mode ='bilinear')) + x = torch.add(x, x2) + x = 
F.relu(F.interpolate(self.decoder4(x) , scale_factor=(2,2), mode ='bilinear')) + x = torch.add(x, x1) + x = F.relu(F.interpolate(self.decoder5(x) , scale_factor=(2,2), mode ='bilinear')) + x = self.soft(self.adjust(F.relu(x))) + # pdb.set_trace() + return x + + def forward(self, x): + return self._forward_impl(x) + +class mix(nn.Module): + + def __init__(self, block, layers, num_classes=2, zero_init_residual=True, + groups=8, width_per_group=64, replace_stride_with_dilation=None, + norm_layer=None, s=0.125, img_size = 128,imgchan = 3): + super(mix, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + self._norm_layer = norm_layer + + self.inplanes = int(64 * s) + self.dilation = 1 + if replace_stride_with_dilation is None: + # each element in the tuple indicates if we should replace + # the 2x2 stride with a dilated convolution instead + replace_stride_with_dilation = [False, False, False] + if len(replace_stride_with_dilation) != 3: + raise ValueError("replace_stride_with_dilation should be None " + "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) + self.groups = groups + self.base_width = width_per_group + self.conv1 = nn.Conv2d(imgchan, self.inplanes, kernel_size=7, stride=2, padding=3, + bias=False) + # self.conv1 = nn.Conv2d(1, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = norm_layer(self.inplanes) + self.relu = nn.ReLU(inplace=True) + # self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, int(128 * s), layers[0], kernel_size= (img_size//2)) + self.layer2 = self._make_layer(block, int(256 * s), layers[1], stride=2, kernel_size=(img_size//2), + dilate=replace_stride_with_dilation[0]) + self.layer3 = self._make_layer(block, int(512 * s), layers[2], stride=2, kernel_size=(img_size//4), + dilate=replace_stride_with_dilation[1]) + # self.layer4 = self._make_layer(block, int(1024 * s), layers[3], stride=2, kernel_size=(img_size//8), + # 
dilate=replace_stride_with_dilation[2]) + + # Decoder + # self.decoder1 = nn.Conv2d(int(1024 *2*s) , int(1024*2*s), kernel_size=3, stride=2, padding=1) + # self.decoder2 = nn.Conv2d(int(1024 *2*s) , int(1024*s), kernel_size=3, stride=1, padding=1) + self.decoder3 = nn.Conv2d(int(1024*s), int(512*s), kernel_size=3, stride=1, padding=1) + self.decoder4 = nn.Conv2d(int(512*s) , int(256*s), kernel_size=3, stride=1, padding=1) + self.decoder5 = nn.Conv2d(int(256*s) , int(128*s) , kernel_size=3, stride=1, padding=1) + self.adjust = nn.Conv2d(int(128*s) , num_classes, kernel_size=1, stride=1, padding=0) + self.soft = nn.Softmax(dim=1) + + + self.conv1_p = nn.Conv2d(imgchan, self.inplanes, kernel_size=7, stride=2, padding=3, + bias=False) + # self.conv1 = nn.Conv2d(1, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1_p = norm_layer(self.inplanes) + self.relu_p = nn.ReLU(inplace=True) + + img_size_p = img_size // 4 + + self.layer1_p = self._make_layer(block, int(128 * s), layers[0], kernel_size= (img_size_p//2)) + self.layer2_p = self._make_layer(block, int(256 * s), layers[1], stride=2, kernel_size=(img_size_p//2), + dilate=replace_stride_with_dilation[0]) + self.layer3_p = self._make_layer(block, int(512 * s), layers[2], stride=2, kernel_size=(img_size_p//4), + dilate=replace_stride_with_dilation[1]) + self.layer4_p = self._make_layer(block, int(1024 * s), layers[3], stride=2, kernel_size=(img_size_p//8), + dilate=replace_stride_with_dilation[2]) + + # Decoder + self.decoder1_p = nn.Conv2d(int(1024 *2*s) , int(1024*2*s), kernel_size=3, stride=2, padding=1) + self.decoder2_p = nn.Conv2d(int(1024 *2*s) , int(1024*s), kernel_size=3, stride=1, padding=1) + self.decoder3_p = nn.Conv2d(int(1024*s), int(512*s), kernel_size=3, stride=1, padding=1) + self.decoder4_p = nn.Conv2d(int(512*s) , int(256*s), kernel_size=3, stride=1, padding=1) + self.decoder5_p = nn.Conv2d(int(256*s) , int(128*s) , kernel_size=3, stride=1, padding=1) + + self.decoderf = 
nn.Conv2d(int(128*s) , int(128*s) , kernel_size=3, stride=1, padding=1) + self.adjust_p = nn.Conv2d(int(128*s) , num_classes, kernel_size=1, stride=1, padding=0) + self.soft_p = nn.Softmax(dim=1) + + # for m in self.modules(): + # if isinstance(m, (nn.Conv2d, nn.Conv1d)): + # if isinstance(m, qkv_transform): + # pass + # else: + # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + # elif isinstance(m, (nn.BatchNorm2d, nn.BatchNorm1d, nn.GroupNorm)): + # nn.init.constant_(m.weight, 1) + # nn.init.constant_(m.bias, 0) + + # Zero-initialize the last BN in each residual branch, + # so that the residual branch starts with zeros, and each residual block behaves like an identity. + # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 + # if zero_init_residual: + # for m in self.modules(): + # if isinstance(m, AxialBlock): + # nn.init.constant_(m.bn2.weight, 0) + + def _make_layer(self, block, planes, blocks, kernel_size=56, stride=1, dilate=False): + norm_layer = self._norm_layer + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + norm_layer(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample, groups=self.groups, + base_width=self.base_width, dilation=previous_dilation, + norm_layer=norm_layer, kernel_size=kernel_size)) + self.inplanes = planes * block.expansion + if stride != 1: + kernel_size = kernel_size // 2 + + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes, groups=self.groups, + base_width=self.base_width, dilation=self.dilation, + norm_layer=norm_layer, kernel_size=kernel_size)) + + return nn.Sequential(*layers) + + def _forward_impl(self, x): + # See note [TorchScript super()] + # AxialAttention Encoder + # pdb.set_trace() 
+ xin = x.clone() + x = self.conv1(x) + x = self.bn1(x) + # x = F.max_pool2d(x,2,2) + x = self.relu(x) + + # x = self.maxpool(x) + # pdb.set_trace() + x1 = self.layer1(x) + # print(x1.shape) + x2 = self.layer2(x1) + # print(x2.shape) + x3 = self.layer3(x2) + # # print(x3.shape) + # x4 = self.layer4(x3) + # # print(x4.shape) + # # pdb.set_trace() + # # Transposed Convolution Decoder + # x = F.relu(F.interpolate(self.decoder1(x4), scale_factor=(2,2), mode ='bilinear')) + # x = torch.add(x, x4) + # x = F.relu(F.interpolate(self.decoder2(x4) , scale_factor=(2,2), mode ='bilinear')) + # x = torch.add(x, x3) + x = F.relu(F.interpolate(self.decoder3(x3) , scale_factor=(2,2), mode ='bilinear')) + x = torch.add(x, x2) + x = F.relu(F.interpolate(self.decoder4(x2) , scale_factor=(2,2), mode ='bilinear')) + x = torch.add(x, x1) + x = F.relu(F.interpolate(self.decoder5(x) , scale_factor=(2,2), mode ='bilinear')) + # print(x.shape) + + # end of full image training + + # y_out = torch.ones((1,2,128,128)) + x_loc = x.clone() + # x = F.relu(F.interpolate(self.decoder5(x) , scale_factor=(2,2), mode ='bilinear')) + #start + for i in range(0,4): + for j in range(0,4): + + x_p = xin[:,:,32*i:32*(i+1),32*j:32*(j+1)] + # begin patch wise + x_p = self.conv1_p(x_p) + x_p = self.bn1_p(x_p) + # x = F.max_pool2d(x,2,2) + x_p = self.relu(x_p) + + # x = self.maxpool(x) + # pdb.set_trace() + x1_p = self.layer1_p(x_p) + # print(x1.shape) + x2_p = self.layer2_p(x1_p) + # print(x2.shape) + x3_p = self.layer3_p(x2_p) + # # print(x3.shape) + x4_p = self.layer4_p(x3_p) + # # print(x4.shape) + # # pdb.set_trace() + # # Transposed Convolution Decoder + x_p = F.relu(F.interpolate(self.decoder1_p(x4_p), scale_factor=(2,2), mode ='bilinear')) + x_p = torch.add(x_p, x4_p) + x_p = F.relu(F.interpolate(self.decoder2_p(x_p) , scale_factor=(2,2), mode ='bilinear')) + x_p = torch.add(x_p, x3_p) + x_p = F.relu(F.interpolate(self.decoder3_p(x_p) , scale_factor=(2,2), mode ='bilinear')) + x_p = torch.add(x_p, x2_p) 
+ x_p = F.relu(F.interpolate(self.decoder4_p(x_p) , scale_factor=(2,2), mode ='bilinear')) + x_p = torch.add(x_p, x1_p) + x_p = F.relu(F.interpolate(self.decoder5_p(x_p) , scale_factor=(2,2), mode ='bilinear')) + # x_p = self.soft_p(self.adjust_p(F.relu(x_p))) + # print(x_p.shape) + x_loc[:,:,32*i:32*(i+1),32*j:32*(j+1)] = x_p + + x = torch.add(x,x_loc) + x = F.relu(self.decoderf(x)) + + x = self.soft(self.adjust(F.relu(x))) + + # pdb.set_trace() + return x + + def forward(self, x): + return self._forward_impl(x) + +class mix_wopos(nn.Module): + + def __init__(self, block, block_2, layers, num_classes=2, zero_init_residual=True, + groups=8, width_per_group=64, replace_stride_with_dilation=None, + norm_layer=None, s=0.125, img_size = 128,imgchan = 3): + super(mix_wopos, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + self._norm_layer = norm_layer + + self.inplanes = int(64 * s) + self.dilation = 1 + if replace_stride_with_dilation is None: + # each element in the tuple indicates if we should replace + # the 2x2 stride with a dilated convolution instead + replace_stride_with_dilation = [False, False, False] + if len(replace_stride_with_dilation) != 3: + raise ValueError("replace_stride_with_dilation should be None " + "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) + self.groups = groups + self.base_width = width_per_group + self.conv1 = nn.Conv2d(imgchan, self.inplanes, kernel_size=7, stride=2, padding=3, + bias=False) + self.conv2 = nn.Conv2d(self.inplanes, 128, kernel_size=3, stride=1, padding=1, bias=False) + self.conv3 = nn.Conv2d(128, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = norm_layer(self.inplanes) + self.bn2 = norm_layer(128) + self.bn3 = norm_layer(self.inplanes) + # self.conv1 = nn.Conv2d(1, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = norm_layer(self.inplanes) + self.relu = nn.ReLU(inplace=True) + # self.maxpool = nn.MaxPool2d(kernel_size=3, 
stride=2, padding=1) + self.layer1 = self._make_layer(block, int(128 * s), layers[0], kernel_size= (img_size//2)) + self.layer2 = self._make_layer(block, int(256 * s), layers[1], stride=2, kernel_size=(img_size//2), + dilate=replace_stride_with_dilation[0]) + # self.layer3 = self._make_layer(block, int(512 * s), layers[2], stride=2, kernel_size=(img_size//4), + # dilate=replace_stride_with_dilation[1]) + # self.layer4 = self._make_layer(block, int(1024 * s), layers[3], stride=2, kernel_size=(img_size//8), + # dilate=replace_stride_with_dilation[2]) + + # Decoder + # self.decoder1 = nn.Conv2d(int(1024 *2*s) , int(1024*2*s), kernel_size=3, stride=2, padding=1) + # self.decoder2 = nn.Conv2d(int(1024 *2*s) , int(1024*s), kernel_size=3, stride=1, padding=1) + # self.decoder3 = nn.Conv2d(int(1024*s), int(512*s), kernel_size=3, stride=1, padding=1) + self.decoder4 = nn.Conv2d(int(512*s) , int(256*s), kernel_size=3, stride=1, padding=1) + self.decoder5 = nn.Conv2d(int(256*s) , int(128*s) , kernel_size=3, stride=1, padding=1) + self.adjust = nn.Conv2d(int(128*s) , num_classes, kernel_size=1, stride=1, padding=0) + self.soft = nn.Softmax(dim=1) + + + self.conv1_p = nn.Conv2d(imgchan, self.inplanes, kernel_size=7, stride=2, padding=3, + bias=False) + self.conv2_p = nn.Conv2d(self.inplanes,128, kernel_size=3, stride=1, padding=1, + bias=False) + self.conv3_p = nn.Conv2d(128, self.inplanes, kernel_size=3, stride=1, padding=1, + bias=False) + # self.conv1 = nn.Conv2d(1, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1_p = norm_layer(self.inplanes) + self.bn2_p = norm_layer(128) + self.bn3_p = norm_layer(self.inplanes) + + self.relu_p = nn.ReLU(inplace=True) + + img_size_p = img_size // 4 + + self.layer1_p = self._make_layer(block_2, int(128 * s), layers[0], kernel_size= (img_size_p//2)) + self.layer2_p = self._make_layer(block_2, int(256 * s), layers[1], stride=2, kernel_size=(img_size_p//2), + dilate=replace_stride_with_dilation[0]) + self.layer3_p = 
self._make_layer(block_2, int(512 * s), layers[2], stride=2, kernel_size=(img_size_p//4), + dilate=replace_stride_with_dilation[1]) + self.layer4_p = self._make_layer(block_2, int(1024 * s), layers[3], stride=2, kernel_size=(img_size_p//8), + dilate=replace_stride_with_dilation[2]) + + # Decoder + self.decoder1_p = nn.Conv2d(int(1024 *2*s) , int(1024*2*s), kernel_size=3, stride=2, padding=1) + self.decoder2_p = nn.Conv2d(int(1024 *2*s) , int(1024*s), kernel_size=3, stride=1, padding=1) + self.decoder3_p = nn.Conv2d(int(1024*s), int(512*s), kernel_size=3, stride=1, padding=1) + self.decoder4_p = nn.Conv2d(int(512*s) , int(256*s), kernel_size=3, stride=1, padding=1) + self.decoder5_p = nn.Conv2d(int(256*s) , int(128*s) , kernel_size=3, stride=1, padding=1) + + self.decoderf = nn.Conv2d(int(128*s) , int(128*s) , kernel_size=3, stride=1, padding=1) + self.adjust_p = nn.Conv2d(int(128*s) , num_classes, kernel_size=1, stride=1, padding=0) + self.soft_p = nn.Softmax(dim=1) + + # for m in self.modules(): + # if isinstance(m, (nn.Conv2d, nn.Conv1d)): + # if isinstance(m, qkv_transform): + # pass + # else: + # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + # elif isinstance(m, (nn.BatchNorm2d, nn.BatchNorm1d, nn.GroupNorm)): + # nn.init.constant_(m.weight, 1) + # nn.init.constant_(m.bias, 0) + + # Zero-initialize the last BN in each residual branch, + # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
+ # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 + # if zero_init_residual: + # for m in self.modules(): + # if isinstance(m, AxialBlock): + # nn.init.constant_(m.bn2.weight, 0) + + def _make_layer(self, block, planes, blocks, kernel_size=56, stride=1, dilate=False): + norm_layer = self._norm_layer + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + norm_layer(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample, groups=self.groups, + base_width=self.base_width, dilation=previous_dilation, + norm_layer=norm_layer, kernel_size=kernel_size)) + self.inplanes = planes * block.expansion + if stride != 1: + kernel_size = kernel_size // 2 + + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes, groups=self.groups, + base_width=self.base_width, dilation=self.dilation, + norm_layer=norm_layer, kernel_size=kernel_size)) + + return nn.Sequential(*layers) + + def _forward_impl(self, x): + # See note [TorchScript super()] + # AxialAttention Encoder + # pdb.set_trace() + xin = x.clone() + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.bn2(x) + x = self.relu(x) + x = self.conv3(x) + x = self.bn3(x) + # x = F.max_pool2d(x,2,2) + x = self.relu(x) + + # x = self.maxpool(x) + # pdb.set_trace() + x1 = self.layer1(x) + # print(x1.shape) + x2 = self.layer2(x1) + # print(x2.shape) + # x3 = self.layer3(x2) + # # print(x3.shape) + # x4 = self.layer4(x3) + # # print(x4.shape) + # # pdb.set_trace() + # # Transposed Convolution Decoder + # x = F.relu(F.interpolate(self.decoder1(x4), scale_factor=(2,2), mode ='bilinear')) + # x = torch.add(x, x4) + # x = F.relu(F.interpolate(self.decoder2(x4) , scale_factor=(2,2), mode ='bilinear')) + # 
x = torch.add(x, x3) + # x = F.relu(F.interpolate(self.decoder3(x3) , scale_factor=(2,2), mode ='bilinear')) + # x = torch.add(x, x2) + x = F.relu(F.interpolate(self.decoder4(x2) , scale_factor=(2,2), mode ='bilinear')) + x = torch.add(x, x1) + x = F.relu(F.interpolate(self.decoder5(x) , scale_factor=(2,2), mode ='bilinear')) + # print(x.shape) + + # end of full image training + + # y_out = torch.ones((1,2,128,128)) + x_loc = x.clone() + # x = F.relu(F.interpolate(self.decoder5(x) , scale_factor=(2,2), mode ='bilinear')) + #start + for i in range(0,4): + for j in range(0,4): + + x_p = xin[:,:,32*i:32*(i+1),32*j:32*(j+1)] + # begin patch wise + x_p = self.conv1_p(x_p) + x_p = self.bn1_p(x_p) + # x = F.max_pool2d(x,2,2) + x_p = self.relu(x_p) + + x_p = self.conv2_p(x_p) + x_p = self.bn2_p(x_p) + # x = F.max_pool2d(x,2,2) + x_p = self.relu(x_p) + x_p = self.conv3_p(x_p) + x_p = self.bn3_p(x_p) + # x = F.max_pool2d(x,2,2) + x_p = self.relu(x_p) + + # x = self.maxpool(x) + # pdb.set_trace() + x1_p = self.layer1_p(x_p) + # print(x1.shape) + x2_p = self.layer2_p(x1_p) + # print(x2.shape) + x3_p = self.layer3_p(x2_p) + # # print(x3.shape) + x4_p = self.layer4_p(x3_p) + # # print(x4.shape) + # # pdb.set_trace() + # # Transposed Convolution Decoder + x_p = F.relu(F.interpolate(self.decoder1_p(x4_p), scale_factor=(2,2), mode ='bilinear')) + x_p = torch.add(x_p, x4_p) + x_p = F.relu(F.interpolate(self.decoder2_p(x_p) , scale_factor=(2,2), mode ='bilinear')) + x_p = torch.add(x_p, x3_p) + x_p = F.relu(F.interpolate(self.decoder3_p(x_p) , scale_factor=(2,2), mode ='bilinear')) + x_p = torch.add(x_p, x2_p) + x_p = F.relu(F.interpolate(self.decoder4_p(x_p) , scale_factor=(2,2), mode ='bilinear')) + x_p = torch.add(x_p, x1_p) + x_p = F.relu(F.interpolate(self.decoder5_p(x_p) , scale_factor=(2,2), mode ='bilinear')) + # x_p = self.soft_p(self.adjust_p(F.relu(x_p))) + # print(x_p.shape) + x_loc[:,:,32*i:32*(i+1),32*j:32*(j+1)] = x_p + + x = torch.add(x,x_loc) + x = 
F.relu(self.decoderf(x)) + + x = self.adjust(F.relu(x)) + + # pdb.set_trace() + return x + + def forward(self, x): + return self._forward_impl(x) + +class mix_wopos_512(nn.Module): + + def __init__(self, block, block_2, layers, num_classes=2, zero_init_residual=True, + groups=8, width_per_group=64, replace_stride_with_dilation=None, + norm_layer=None, s=0.125, img_size = 128,imgchan = 3): + super(mix_wopos_512, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + self._norm_layer = norm_layer + + self.inplanes = int(64 * s) + self.dilation = 1 + if replace_stride_with_dilation is None: + # each element in the tuple indicates if we should replace + # the 2x2 stride with a dilated convolution instead + replace_stride_with_dilation = [False, False, False] + if len(replace_stride_with_dilation) != 3: + raise ValueError("replace_stride_with_dilation should be None " + "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) + self.groups = groups + self.base_width = width_per_group + self.conv1 = nn.Conv2d(imgchan, self.inplanes, kernel_size=7, stride=2, padding=3, + bias=False) + # self.conv1 = nn.Conv2d(1, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = norm_layer(self.inplanes) + self.conv2 = nn.Conv2d(self.inplanes, 128, kernel_size=3, stride=1, padding=1, bias=False) + self.conv3 = nn.Conv2d(128, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = norm_layer(self.inplanes) + self.bn2 = norm_layer(128) + self.bn3 = norm_layer(self.inplanes) + self.relu = nn.ReLU(inplace=True) + # self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, int(128 * s), layers[0], kernel_size= (img_size//2)) + self.layer2 = self._make_layer(block, int(256 * s), layers[1], stride=2, kernel_size=(img_size//2), + dilate=replace_stride_with_dilation[0]) + self.layer3 = self._make_layer(block, int(512 * s), layers[2], stride=2, kernel_size=(img_size//4), + 
dilate=replace_stride_with_dilation[1]) + # self.layer4 = self._make_layer(block, int(1024 * s), layers[3], stride=2, kernel_size=(img_size//8), + # dilate=replace_stride_with_dilation[2]) + + # Decoder + # self.decoder1 = nn.Conv2d(int(1024 *2*s) , int(1024*2*s), kernel_size=3, stride=2, padding=1) + # self.decoder2 = nn.Conv2d(int(1024 *2*s) , int(1024*s), kernel_size=3, stride=1, padding=1) + self.decoder3 = nn.Conv2d(int(1024*s), int(512*s), kernel_size=3, stride=1, padding=1) + self.decoder4 = nn.Conv2d(int(512*s) , int(256*s), kernel_size=3, stride=1, padding=1) + self.decoder5 = nn.Conv2d(int(256*s) , int(128*s) , kernel_size=3, stride=1, padding=1) + self.adjust = nn.Conv2d(int(128*s) , num_classes, kernel_size=1, stride=1, padding=0) + self.soft = nn.Softmax(dim=1) + + self.conv1_p = nn.Conv2d(imgchan, self.inplanes, kernel_size=7, stride=2, padding=3, + bias=False) + self.conv2_p = nn.Conv2d(self.inplanes,128, kernel_size=3, stride=1, padding=1, + bias=False) + self.conv3_p = nn.Conv2d(128, self.inplanes, kernel_size=3, stride=1, padding=1, + bias=False) + # self.conv1 = nn.Conv2d(1, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1_p = norm_layer(self.inplanes) + self.bn2_p = norm_layer(128) + self.bn3_p = norm_layer(self.inplanes) + self.conv1_p = nn.Conv2d(imgchan, self.inplanes, kernel_size=7, stride=2, padding=3, + bias=False) + # self.conv1 = nn.Conv2d(1, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1_p = norm_layer(self.inplanes) + self.relu_p = nn.ReLU(inplace=True) + + img_size_p = img_size // 4 + + self.layer1_p = self._make_layer(block_2, int(128 * s), layers[0], kernel_size= (img_size_p//2)) + self.layer2_p = self._make_layer(block_2, int(256 * s), layers[1], stride=2, kernel_size=(img_size_p//2), + dilate=replace_stride_with_dilation[0]) + self.layer3_p = self._make_layer(block_2, int(512 * s), layers[2], stride=2, kernel_size=(img_size_p//4), + dilate=replace_stride_with_dilation[1]) + 
self.layer4_p = self._make_layer(block_2, int(1024 * s), layers[3], stride=2, kernel_size=(img_size_p//8), + dilate=replace_stride_with_dilation[2]) + + # Decoder + self.decoder1_p = nn.Conv2d(int(1024 *2*s) , int(1024*2*s), kernel_size=3, stride=2, padding=1) + self.decoder2_p = nn.Conv2d(int(1024 *2*s) , int(1024*s), kernel_size=3, stride=1, padding=1) + self.decoder3_p = nn.Conv2d(int(1024*s), int(512*s), kernel_size=3, stride=1, padding=1) + self.decoder4_p = nn.Conv2d(int(512*s) , int(256*s), kernel_size=3, stride=1, padding=1) + self.decoder5_p = nn.Conv2d(int(256*s) , int(128*s) , kernel_size=3, stride=1, padding=1) + + self.decoderf = nn.Conv2d(int(128*s) , int(128*s) , kernel_size=3, stride=1, padding=1) + self.adjust_p = nn.Conv2d(int(128*s) , num_classes, kernel_size=1, stride=1, padding=0) + self.soft_p = nn.Softmax(dim=1) + + # for m in self.modules(): + # if isinstance(m, (nn.Conv2d, nn.Conv1d)): + # if isinstance(m, qkv_transform): + # pass + # else: + # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + # elif isinstance(m, (nn.BatchNorm2d, nn.BatchNorm1d, nn.GroupNorm)): + # nn.init.constant_(m.weight, 1) + # nn.init.constant_(m.bias, 0) + + # Zero-initialize the last BN in each residual branch, + # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
+ # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 + # if zero_init_residual: + # for m in self.modules(): + # if isinstance(m, AxialBlock): + # nn.init.constant_(m.bn2.weight, 0) + + def _make_layer(self, block, planes, blocks, kernel_size=56, stride=1, dilate=False): + norm_layer = self._norm_layer + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + norm_layer(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample, groups=self.groups, + base_width=self.base_width, dilation=previous_dilation, + norm_layer=norm_layer, kernel_size=kernel_size)) + self.inplanes = planes * block.expansion + if stride != 1: + kernel_size = kernel_size // 2 + + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes, groups=self.groups, + base_width=self.base_width, dilation=self.dilation, + norm_layer=norm_layer, kernel_size=kernel_size)) + + return nn.Sequential(*layers) + + def _forward_impl(self, x): + # See note [TorchScript super()] + # AxialAttention Encoder + # pdb.set_trace() + xin = x.clone() + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.bn2(x) + x = self.relu(x) + x = self.conv3(x) + x = self.bn3(x) + # x = F.max_pool2d(x,2,2) + x = self.relu(x) + + # x = self.maxpool(x) + # pdb.set_trace() + x1 = self.layer1(x) + # print(x1.shape) + x2 = self.layer2(x1) + # print(x2.shape) + x3 = self.layer3(x2) + # # print(x3.shape) + # x4 = self.layer4(x3) + # # print(x4.shape) + # # pdb.set_trace() + # # Transposed Convolution Decoder + # x = F.relu(F.interpolate(self.decoder1(x4), scale_factor=(2,2), mode ='bilinear')) + # x = torch.add(x, x4) + # x = F.relu(F.interpolate(self.decoder2(x4) , scale_factor=(2,2), mode ='bilinear')) + # x 
= torch.add(x, x3) + x = F.relu(F.interpolate(self.decoder3(x3) , scale_factor=(2,2), mode ='bilinear')) + x = torch.add(x, x2) + x = F.relu(F.interpolate(self.decoder4(x2) , scale_factor=(2,2), mode ='bilinear')) + x = torch.add(x, x1) + x = F.relu(F.interpolate(self.decoder5(x) , scale_factor=(2,2), mode ='bilinear')) + # print(x.shape) + + # end of full image training + + # y_out = torch.ones((1,2,128,128)) + x_loc = x.clone() + # x = F.relu(F.interpolate(self.decoder5(x) , scale_factor=(2,2), mode ='bilinear')) + #start + for i in range(0,4): + for j in range(0,4): + + x_p = xin[:,:,128*i:128*(i+1),128*j:128*(j+1)] + # begin patch wise + x_p = self.conv1_p(x_p) + x_p = self.bn1_p(x_p) + # x = F.max_pool2d(x,2,2) + x_p = self.relu(x_p) + + x_p = self.conv2_p(x_p) + x_p = self.bn2_p(x_p) + # x = F.max_pool2d(x,2,2) + x_p = self.relu(x_p) + x_p = self.conv3_p(x_p) + x_p = self.bn3_p(x_p) + # x = F.max_pool2d(x,2,2) + x_p = self.relu(x_p) + + # x = self.maxpool(x) + # pdb.set_trace() + x1_p = self.layer1_p(x_p) + # print(x1.shape) + x2_p = self.layer2_p(x1_p) + # print(x2.shape) + x3_p = self.layer3_p(x2_p) + # # # print(x3.shape) + x4_p = self.layer4_p(x3_p) + # # # print(x4.shape) + # # # pdb.set_trace() + # # # Transposed Convolution Decoder + x_p = F.relu(F.interpolate(self.decoder1_p(x4_p), scale_factor=(2,2), mode ='bilinear')) + x_p = torch.add(x_p, x4_p) + x_p = F.relu(F.interpolate(self.decoder2_p(x_p) , scale_factor=(2,2), mode ='bilinear')) + x_p = torch.add(x_p, x3_p) + x_p = F.relu(F.interpolate(self.decoder3_p(x_p) , scale_factor=(2,2), mode ='bilinear')) + x_p = torch.add(x_p, x2_p) + x_p = F.relu(F.interpolate(self.decoder4_p(x_p) , scale_factor=(2,2), mode ='bilinear')) + x_p = torch.add(x_p, x1_p) + x_p = F.relu(F.interpolate(self.decoder5_p(x_p) , scale_factor=(2,2), mode ='bilinear')) + # x_p = self.soft_p(self.adjust_p(F.relu(x_p))) + # print(x_p.shape) + x_loc[:,:,128*i:128*(i+1),128*j:128*(j+1)] = x_p + + x = torch.add(x,x_loc) + x = 
F.relu(self.decoderf(x)) + + x = self.soft(self.adjust(F.relu(x))) + + # pdb.set_trace() + return x + + def forward(self, x): + return self._forward_impl(x) + +class mix_512(nn.Module): + + def __init__(self, block, layers, num_classes=2, zero_init_residual=True, + groups=8, width_per_group=64, replace_stride_with_dilation=None, + norm_layer=None, s=0.125, img_size = 128,imgchan = 3): + super(mix_512, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + self._norm_layer = norm_layer + + self.inplanes = int(64 * s) + self.dilation = 1 + if replace_stride_with_dilation is None: + # each element in the tuple indicates if we should replace + # the 2x2 stride with a dilated convolution instead + replace_stride_with_dilation = [False, False, False] + if len(replace_stride_with_dilation) != 3: + raise ValueError("replace_stride_with_dilation should be None " + "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) + self.groups = groups + self.base_width = width_per_group + self.conv1 = nn.Conv2d(imgchan, self.inplanes, kernel_size=7, stride=2, padding=3, + bias=False) + # self.conv1 = nn.Conv2d(1, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = norm_layer(self.inplanes) + self.relu = nn.ReLU(inplace=True) + # self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, int(128 * s), layers[0], kernel_size= (img_size//2)) + self.layer2 = self._make_layer(block, int(256 * s), layers[1], stride=2, kernel_size=(img_size//2), + dilate=replace_stride_with_dilation[0]) + self.layer3 = self._make_layer(block, int(512 * s), layers[2], stride=2, kernel_size=(img_size//4), + dilate=replace_stride_with_dilation[1]) + # self.layer4 = self._make_layer(block, int(1024 * s), layers[3], stride=2, kernel_size=(img_size//8), + # dilate=replace_stride_with_dilation[2]) + + # Decoder + # self.decoder1 = nn.Conv2d(int(1024 *2*s) , int(1024*2*s), kernel_size=3, stride=2, padding=1) + # 
self.decoder2 = nn.Conv2d(int(1024 *2*s) , int(1024*s), kernel_size=3, stride=1, padding=1) + self.decoder3 = nn.Conv2d(int(1024*s), int(512*s), kernel_size=3, stride=1, padding=1) + self.decoder4 = nn.Conv2d(int(512*s) , int(256*s), kernel_size=3, stride=1, padding=1) + self.decoder5 = nn.Conv2d(int(256*s) , int(128*s) , kernel_size=3, stride=1, padding=1) + self.adjust = nn.Conv2d(int(128*s) , num_classes, kernel_size=1, stride=1, padding=0) + self.soft = nn.Softmax(dim=1) + + + self.conv1_p = nn.Conv2d(imgchan, self.inplanes, kernel_size=7, stride=2, padding=3, + bias=False) + # self.conv1 = nn.Conv2d(1, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1_p = norm_layer(self.inplanes) + self.relu_p = nn.ReLU(inplace=True) + + img_size_p = img_size // 4 + + self.layer1_p = self._make_layer(block, int(128 * s), layers[0], kernel_size= (img_size_p//2)) + self.layer2_p = self._make_layer(block, int(256 * s), layers[1], stride=2, kernel_size=(img_size_p//2), + dilate=replace_stride_with_dilation[0]) + self.layer3_p = self._make_layer(block, int(512 * s), layers[2], stride=2, kernel_size=(img_size_p//4), + dilate=replace_stride_with_dilation[1]) + self.layer4_p = self._make_layer(block, int(1024 * s), layers[3], stride=2, kernel_size=(img_size_p//8), + dilate=replace_stride_with_dilation[2]) + + # Decoder + self.decoder1_p = nn.Conv2d(int(1024 *2*s) , int(1024*2*s), kernel_size=3, stride=2, padding=1) + self.decoder2_p = nn.Conv2d(int(1024 *2*s) , int(1024*s), kernel_size=3, stride=1, padding=1) + self.decoder3_p = nn.Conv2d(int(1024*s), int(512*s), kernel_size=3, stride=1, padding=1) + self.decoder4_p = nn.Conv2d(int(512*s) , int(256*s), kernel_size=3, stride=1, padding=1) + self.decoder5_p = nn.Conv2d(int(256*s) , int(128*s) , kernel_size=3, stride=1, padding=1) + + self.decoderf = nn.Conv2d(int(128*s) , int(128*s) , kernel_size=3, stride=1, padding=1) + self.adjust_p = nn.Conv2d(int(128*s) , num_classes, kernel_size=1, stride=1, padding=0) + 
self.soft_p = nn.Softmax(dim=1) + + # for m in self.modules(): + # if isinstance(m, (nn.Conv2d, nn.Conv1d)): + # if isinstance(m, qkv_transform): + # pass + # else: + # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + # elif isinstance(m, (nn.BatchNorm2d, nn.BatchNorm1d, nn.GroupNorm)): + # nn.init.constant_(m.weight, 1) + # nn.init.constant_(m.bias, 0) + + # Zero-initialize the last BN in each residual branch, + # so that the residual branch starts with zeros, and each residual block behaves like an identity. + # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 + # if zero_init_residual: + # for m in self.modules(): + # if isinstance(m, AxialBlock): + # nn.init.constant_(m.bn2.weight, 0) + + def _make_layer(self, block, planes, blocks, kernel_size=56, stride=1, dilate=False): + norm_layer = self._norm_layer + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + norm_layer(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample, groups=self.groups, + base_width=self.base_width, dilation=previous_dilation, + norm_layer=norm_layer, kernel_size=kernel_size)) + self.inplanes = planes * block.expansion + if stride != 1: + kernel_size = kernel_size // 2 + + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes, groups=self.groups, + base_width=self.base_width, dilation=self.dilation, + norm_layer=norm_layer, kernel_size=kernel_size)) + + return nn.Sequential(*layers) + + def _forward_impl(self, x): + # See note [TorchScript super()] + # AxialAttention Encoder + # pdb.set_trace() + xin = x.clone() + x = self.conv1(x) + x = self.bn1(x) + # x = F.max_pool2d(x,2,2) + x = self.relu(x) + + # x = self.maxpool(x) + # pdb.set_trace() + x1 = 
self.layer1(x) + # print(x1.shape) + x2 = self.layer2(x1) + # print(x2.shape) + x3 = self.layer3(x2) + # # print(x3.shape) + # x4 = self.layer4(x3) + # # print(x4.shape) + # # pdb.set_trace() + # # Transposed Convolution Decoder + # x = F.relu(F.interpolate(self.decoder1(x4), scale_factor=(2,2), mode ='bilinear')) + # x = torch.add(x, x4) + # x = F.relu(F.interpolate(self.decoder2(x4) , scale_factor=(2,2), mode ='bilinear')) + # x = torch.add(x, x3) + x = F.relu(F.interpolate(self.decoder3(x3) , scale_factor=(2,2), mode ='bilinear')) + x = torch.add(x, x2) + x = F.relu(F.interpolate(self.decoder4(x2) , scale_factor=(2,2), mode ='bilinear')) + x = torch.add(x, x1) + x = F.relu(F.interpolate(self.decoder5(x) , scale_factor=(2,2), mode ='bilinear')) + # print(x.shape) + + # end of full image training + + # y_out = torch.ones((1,2,128,128)) + x_loc = x.clone() + # x = F.relu(F.interpolate(self.decoder5(x) , scale_factor=(2,2), mode ='bilinear')) + #start + for i in range(0,4): + for j in range(0,4): + + x_p = xin[:,:,128*i:128*(i+1),128*j:128*(j+1)] + # begin patch wise + x_p = self.conv1_p(x_p) + x_p = self.bn1_p(x_p) + # x = F.max_pool2d(x,2,2) + x_p = self.relu(x_p) + + # x = self.maxpool(x) + # pdb.set_trace() + x1_p = self.layer1_p(x_p) + # print(x1.shape) + x2_p = self.layer2_p(x1_p) + # print(x2.shape) + x3_p = self.layer3_p(x2_p) + # # print(x3.shape) + x4_p = self.layer4_p(x3_p) + # # print(x4.shape) + # # pdb.set_trace() + # # Transposed Convolution Decoder + x_p = F.relu(F.interpolate(self.decoder1_p(x4_p), scale_factor=(2,2), mode ='bilinear')) + x_p = torch.add(x_p, x4_p) + x_p = F.relu(F.interpolate(self.decoder2_p(x_p) , scale_factor=(2,2), mode ='bilinear')) + x_p = torch.add(x_p, x3_p) + x_p = F.relu(F.interpolate(self.decoder3_p(x_p) , scale_factor=(2,2), mode ='bilinear')) + x_p = torch.add(x_p, x2_p) + x_p = F.relu(F.interpolate(self.decoder4_p(x_p) , scale_factor=(2,2), mode ='bilinear')) + x_p = torch.add(x_p, x1_p) + x_p = 
F.relu(F.interpolate(self.decoder5_p(x_p) , scale_factor=(2,2), mode ='bilinear')) + # x_p = self.soft_p(self.adjust_p(F.relu(x_p))) + # print(x_p.shape) + x_loc[:,:,128*i:128*(i+1),128*j:128*(j+1)] = x_p + + x = torch.add(x,x_loc) + x = F.relu(self.decoderf(x)) + + x = self.soft(self.adjust(F.relu(x))) + + # pdb.set_trace() + return x + + def forward(self, x): + return self._forward_impl(x) + +class ResAxialAttentionUNetshallow(nn.Module): + + def __init__(self, block, layers, num_classes=2, zero_init_residual=True, + groups=8, width_per_group=64, replace_stride_with_dilation=None, + norm_layer=None, s=0.125, img_size = 128, imgchan = 3): + super(ResAxialAttentionUNetshallow, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + self._norm_layer = norm_layer + + self.inplanes = int(64 * s) + self.dilation = 1 + if replace_stride_with_dilation is None: + # each element in the tuple indicates if we should replace + # the 2x2 stride with a dilated convolution instead + replace_stride_with_dilation = [False, False, False] + if len(replace_stride_with_dilation) != 3: + raise ValueError("replace_stride_with_dilation should be None " + "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) + self.groups = groups + self.base_width = width_per_group + self.conv1 = nn.Conv2d(imgchan, self.inplanes, kernel_size=7, stride=2, padding=3, + bias=False) + # self.conv1 = nn.Conv2d(1, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = norm_layer(self.inplanes) + self.relu = nn.ReLU(inplace=True) + # self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, int(128 * s), layers[0], kernel_size= (img_size//2)) + self.layer2 = self._make_layer(block, int(256 * s), layers[1], stride=2, kernel_size=(img_size//2), + dilate=replace_stride_with_dilation[0]) + # self.layer3 = self._make_layer(block, int(512 * s), layers[2], stride=2, kernel_size=(img_size//4), + # 
dilate=replace_stride_with_dilation[1]) + # self.layer4 = self._make_layer(block, int(1024 * s), layers[3], stride=2, kernel_size=(img_size//8), + # dilate=replace_stride_with_dilation[2]) + + # Decoder + # self.decoder1 = nn.Conv2d(int(1024 *2*s) , int(1024*2*s), kernel_size=3, stride=2, padding=1) + # self.decoder2 = nn.Conv2d(int(1024 *2*s) , int(1024*s), kernel_size=3, stride=1, padding=1) + # self.decoder3 = nn.Conv2d(int(1024*s), int(512*s), kernel_size=3, stride=1, padding=1) + self.decoder4 = nn.Conv2d(int(512*s) , int(256*s), kernel_size=3, stride=1, padding=1) + self.decoder5 = nn.Conv2d(int(256*s) , int(128*s) , kernel_size=3, stride=1, padding=1) + self.adjust = nn.Conv2d(int(128*s) , num_classes, kernel_size=1, stride=1, padding=0) + self.soft = nn.Softmax(dim=1) + + # for m in self.modules(): + # if isinstance(m, (nn.Conv2d, nn.Conv1d)): + # if isinstance(m, qkv_transform): + # pass + # else: + # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + # elif isinstance(m, (nn.BatchNorm2d, nn.BatchNorm1d, nn.GroupNorm)): + # nn.init.constant_(m.weight, 1) + # nn.init.constant_(m.bias, 0) + + # Zero-initialize the last BN in each residual branch, + # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
+ # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 + # if zero_init_residual: + # for m in self.modules(): + # if isinstance(m, AxialBlock): + # nn.init.constant_(m.bn2.weight, 0) + + def _make_layer(self, block, planes, blocks, kernel_size=56, stride=1, dilate=False): + norm_layer = self._norm_layer + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + norm_layer(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample, groups=self.groups, + base_width=self.base_width, dilation=previous_dilation, + norm_layer=norm_layer, kernel_size=kernel_size)) + self.inplanes = planes * block.expansion + if stride != 1: + kernel_size = kernel_size // 2 + + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes, groups=self.groups, + base_width=self.base_width, dilation=self.dilation, + norm_layer=norm_layer, kernel_size=kernel_size)) + + return nn.Sequential(*layers) + + def _forward_impl(self, x): + # See note [TorchScript super()] + # AxialAttention Encoder + # pdb.set_trace() + x = self.conv1(x) + x = self.bn1(x) + # x = F.max_pool2d(x,2,2) + x = self.relu(x) + + # x = self.maxpool(x) + # pdb.set_trace() + x1 = self.layer1(x) + # print(x1.shape) + x2 = self.layer2(x1) + # print(x2.shape) + # x3 = self.layer3(x2) + # print(x3.shape) + # x4 = self.layer4(x3) + # print(x4.shape) + # pdb.set_trace() + # Transposed Convolution Decoder + # x = F.relu(F.interpolate(self.decoder1(x4), scale_factor=(2,2), mode ='bilinear')) + # x = torch.add(x, x4) + # x = F.relu(F.interpolate(self.decoder2(x) , scale_factor=(2,2), mode ='bilinear')) + # x = torch.add(x, x3) + # x = F.relu(F.interpolate(self.decoder3(x2) , scale_factor=(2,2), mode ='bilinear')) + # x = torch.add(x, x2) + x = 
F.relu(F.interpolate(self.decoder4(x2) , scale_factor=(2,2), mode ='bilinear')) + x = torch.add(x, x1) + x = F.relu(F.interpolate(self.decoder5(x1) , scale_factor=(2,2), mode ='bilinear')) + x = self.soft(self.adjust(F.relu(x))) + # pdb.set_trace() + return x + + def forward(self, x): + return self._forward_impl(x) + +class autoencoder(nn.Module): + def __init__(self): + super(autoencoder, self).__init__() + + + self.encoder1 = nn.Conv2d(3, 64, 3, stride=1, padding=1) # b, 16, 10, 10 + self.encoder2= nn.Conv2d(64, 128, 3, stride=1, padding=1) # b, 8, 3, 3 + self.encoder3= nn.Conv2d(128, 256, 3, stride=1, padding=1) + self.encoder4= nn.Conv2d(256, 512, 3, stride=1, padding=1) + self.encoder5= nn.Conv2d(512, 1024, 3, stride=1, padding=1) + + self.decoder1 = nn.Conv2d(1024, 512, 3, stride=1,padding=2) # b, 16, 5, 5 + self.decoder2 = nn.Conv2d(512, 256, 3, stride=1, padding=2) # b, 8, 15, 1 + self.decoder3 = nn.Conv2d(256, 128, 3, stride=1, padding=1) # b, 1, 28, 28 + self.decoder4 = nn.Conv2d(128, 64, 3, stride=1, padding=1) + self.decoder5 = nn.Conv2d(64, 2, 3, stride=1, padding=1) + + self.soft = nn.Softmax(dim =1) + + def forward(self, x): + + out = F.relu(F.max_pool2d(self.encoder1(x),2,2)) + out = F.relu(F.max_pool2d(self.encoder2(out),2,2)) + out = F.relu(F.max_pool2d(self.encoder3(out),2,2)) + + out = F.relu(F.interpolate(self.decoder3(out),scale_factor=(2,2),mode ='bilinear')) + + out = F.relu(F.interpolate(self.decoder4(out),scale_factor=(2,2),mode ='bilinear')) + + out = F.relu(F.interpolate(self.decoder5(out),scale_factor=(2,2),mode ='bilinear')) + # print(out.shape) + out = self.soft(out) + return out + + +def axial26s(pretrained=False, **kwargs): + model = AxialAttentionNet(AxialBlock, [1, 2, 4, 1], s=0.5, **kwargs) + return model + + +def axial50s(pretrained=False, **kwargs): + model = AxialAttentionNet(AxialBlock, [3, 4, 6, 3], s=0.5, **kwargs) + return model + + +def axial50m(pretrained=False, **kwargs): + model = AxialAttentionNet(AxialBlock, [3, 4, 
6, 3], s=0.75, **kwargs) + return model + + +def axial50l(pretrained=False, **kwargs): + model = AxialAttentionNet(AxialBlock, [3, 4, 6, 3], s=1, **kwargs) + return model + + +def resxialunet128s(pretrained=False, **kwargs): + model = ResAxialAttentionUNet(AxialBlock_dynamic, [1, 2, 4, 1], s= 0.125,img_size = 128, imgchan =1, **kwargs) + return model + +def resaxialunet_dyn(pretrained=False, **kwargs): + model = ResAxialAttentionUNet(AxialBlock_dynamic, [1, 2, 4, 1], s= 0.125,img_size = 512, imgchan =3, **kwargs) + return model + +def resxialunet_wopos(pretrained=False, **kwargs): + model = ResAxialAttentionUNet(AxialBlock_wopos, [1, 2, 4, 1], s= 0.125,img_size = 128, imgchan =3, **kwargs) + return model + +def resunet(pretrained=False, **kwargs): + model = ResAxialAttentionUNet(AxialBlockmod, [1, 2, 4, 1], s= 0.125, img_size = 128, imgchan = 1, **kwargs) + return model + +def unetplusplus(pretrained=False, **kwargs): + model = unetplus(AxialBlockmod, [1, 2, 4, 1], s= 0.125, img_size = 128, imgchan = 3, **kwargs) + return model + +def mix_net(pretrained=False, **kwargs): + model = mix(AxialBlock_dynamic, [1, 2, 4, 1], s= 0.125, img_size = 128, imgchan = 3, **kwargs) + return model + +def mix_net_512(pretrained=False, **kwargs): + model = mix_512(AxialBlock, [1, 2, 4, 1], s= 0.125, img_size = 512, imgchan = 3, **kwargs) + return model +def mix_net_gated_d(pretrained=False, **kwargs): + model = mix(AxialBlock_gated_data, [1, 2, 4, 1], s= 0.125, img_size = 128, imgchan = 1, **kwargs) + return model + +def mix_net_wopos(pretrained=False, **kwargs): + model = mix_wopos(AxialBlock_dynamic,AxialBlock_wopos, [1, 2, 4, 1], s= 0.125, img_size = 128, imgchan = 1, **kwargs) + return model + +def mix_net_wopos_512(pretrained=False, **kwargs): + model = mix_wopos_512(AxialBlock,AxialBlock_wopos, [1, 2, 4, 1], s= 0.125, img_size = 512, imgchan = 3, **kwargs) + return model +# def resunet_wopos(pretrained=False, **kwargs): +# model = ResAxialAttentionUNet(AxialBlockmod_wopos, [1, 
2, 4, 1], s= 0.125, img_size = 32, imgchan = 3, **kwargs) +# return model + +def resxialunet128s_shallow(pretrained=False, **kwargs): + model = ResAxialAttentionUNetshallow(AxialBlockmod, [1, 2, 4, 1], s= 0.125,img_size = 128, imgchan =3, **kwargs) + return model +# EOF \ No newline at end of file diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/models/myaxialnet.py b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/models/myaxialnet.py new file mode 100644 index 0000000000000000000000000000000000000000..67a16efacc0006798edbf26da9ddd4bbd47034bb --- /dev/null +++ b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/models/myaxialnet.py @@ -0,0 +1,1298 @@ +import pdb +import math +import torch +import torch.nn as nn +import torch.nn.functional as F +from .utils import * +import pdb +import matplotlib.pyplot as plt + +import random + +def conv1x1(in_planes, out_planes, stride=1): + """1x1 convolution""" + return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) + + +class AxialBlock(nn.Module): + expansion = 2 + + def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, + base_width=64, dilation=1, norm_layer=None, kernel_size=56): + super(AxialBlock, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + width = int(planes * (base_width / 64.)) + # Both self.conv2 and self.downsample layers downsample the input when stride != 1 + self.conv_down = conv1x1(inplanes, width) + self.bn1 = norm_layer(width) + self.hight_block = AxialAttention(width, width, groups=groups, kernel_size=kernel_size) + self.width_block = AxialAttention(width, width, groups=groups, kernel_size=kernel_size, stride=stride, width=True) + self.conv_up = conv1x1(width, planes * self.expansion) + self.bn2 = norm_layer(planes * self.expansion) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + 
+ def forward(self, x): + identity = x + + out = self.conv_down(x) # 下采样 inplanes -> width + out = self.bn1(out) + out = self.relu(out) + # print(out.shape) + out = self.hight_block(out) + out = self.width_block(out) + out = self.relu(out) + + out = self.conv_up(out) # width -> planes*2(expansion) + out = self.bn2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + + +class AxialAttention(nn.Module): + def __init__(self, in_planes, out_planes, groups=8, kernel_size=56, + stride=1, bias=False, width=False): + assert (in_planes % groups == 0) and (out_planes % groups == 0) + super(AxialAttention, self).__init__() + self.in_planes = in_planes + self.out_planes = out_planes + self.groups = groups + self.group_planes = out_planes // groups + self.kernel_size = kernel_size + self.stride = stride + self.bias = bias + self.width = width + + # Multi-head self attention + self.qkv_transform = qkv_transform(in_planes, out_planes * 2, kernel_size=1, stride=1, + padding=0, bias=False) + self.bn_qkv = nn.BatchNorm1d(out_planes * 2) + self.bn_similarity = nn.BatchNorm2d(groups * 3) + + self.bn_output = nn.BatchNorm1d(out_planes * 2) + + # Position embedding + self.relative = nn.Parameter(torch.randn(self.group_planes * 2, kernel_size * 2 - 1), requires_grad=True) + query_index = torch.arange(kernel_size).unsqueeze(0) + key_index = torch.arange(kernel_size).unsqueeze(1) + relative_index = key_index - query_index + kernel_size - 1 + self.register_buffer('flatten_index', relative_index.view(-1)) + if stride > 1: + self.pooling = nn.AvgPool2d(stride, stride=stride) + + self.reset_parameters() + + def forward(self, x): + # pdb.set_trace() + if self.width: + x = x.permute(0, 2, 1, 3) + else: + x = x.permute(0, 3, 1, 2) # N, W, C, H + N, W, C, H = x.shape + + # print('N: ', x.shape[0]) # 1 layer2相同 1 layer4与3相同 1 6与5相同 + # print('W: ', x.shape[1]) # 56 56 28 28 14 14 7 + # print('C: ', x.shape[2]) # 16 32 
32 64 64 128 256 + # print('H: ', x.shape[3]) # 56 56 28 28 14 14 7 + x = x.contiguous().view(N * W, C, H) # 56, 16, 56 56, 32, 56 28 32 28 + + # Transformations + qkv = self.bn_qkv(self.qkv_transform(x)) + q, k, v = torch.split(qkv.reshape(N * W, self.groups, self.group_planes * 2, H), + [self.group_planes // 2, self.group_planes // 2, self.group_planes], dim=2) + + # Calculate position embedding + all_embeddings = torch.index_select(self.relative, 1, self.flatten_index).view(self.group_planes * 2, + self.kernel_size, + self.kernel_size) + # torch.index_select(input, dim, index, out=None) 函数返回的是沿着输入张量的指定维度的指定索引号进行索引的张量子集 + q_embedding, k_embedding, v_embedding = torch.split(all_embeddings, + [self.group_planes // 2, self.group_planes // 2, + self.group_planes], dim=0) + + # print('group: ', self.groups) # 8 + # print('in_planes: ', self.in_planes) # 16 32 32 + # print('out_planes: ', self.out_planes) # 16 32 32 + # print('group_planes: ', self.group_planes) # 2 4 4 + # print('all embedding: ', all_embeddings.shape) # [4, 56, 56] [8, 56, 56] [8,28,28] + # print('q_embedding: ', q_embedding.shape) # [1, 56, 56] [2, 56, 56] [2,28,28] + # print('qkv: ', qkv.shape) # 56, 32, 56 -> 56, 8, 4, 56 56, 64, 56 -> 56, 8, 8, 56 28, 64, 28 -> 28, 8, 8, 28 + # print('q: ', q.shape) # [56, 8, 1, 56] [56, 8, 2, 56] [28,8,2,28] + # print('relative.shape: ', self.relative.shape) # [4,111] [8, 111] [8, 55] + # print('flatten_index.shape: ', self.flatten_index.shape) # [3136] [3136] [784] + + qr = torch.einsum('bgci,cij->bgij', q, q_embedding) + kr = torch.einsum('bgci,cij->bgij', k, k_embedding).transpose(2, 3) + + qk = torch.einsum('bgci, bgcj->bgij', q, k) + + stacked_similarity = torch.cat([qk, qr, kr], dim=1) + stacked_similarity = self.bn_similarity(stacked_similarity).view(N * W, 3, self.groups, H, H).sum(dim=1) + # stacked_similarity = self.bn_qr(qr) + self.bn_kr(kr) + self.bn_qk(qk) + # (N, groups, H, H, W) + similarity = F.softmax(stacked_similarity, dim=3) + sv = 
torch.einsum('bgij,bgcj->bgci', similarity, v) + sve = torch.einsum('bgij,cij->bgci', similarity, v_embedding) + stacked_output = torch.cat([sv, sve], dim=-1).view(N * W, self.out_planes * 2, H) + output = self.bn_output(stacked_output).view(N, W, self.out_planes, 2, H).sum(dim=-2) + + if self.width: + output = output.permute(0, 2, 1, 3) + else: + output = output.permute(0, 2, 3, 1) + + if self.stride > 1: + output = self.pooling(output) + + return output + + def reset_parameters(self): + self.qkv_transform.weight.data.normal_(0, math.sqrt(1. / self.in_planes)) + # nn.init.uniform_(self.relative, -0.1, 0.1) + nn.init.normal_(self.relative, 0., math.sqrt(1. / self.group_planes)) + + +# class medt_net(nn.Module): +# +# def __init__(self, block, block_2, layers, num_classes=2, zero_init_residual=True, +# groups=8, width_per_group=64, replace_stride_with_dilation=None, +# norm_layer=None, s=0.125, img_size=128, imgchan=3): +# super(medt_net, self).__init__() +# if norm_layer is None: +# norm_layer = nn.BatchNorm2d +# self._norm_layer = norm_layer +# +# self.inplanes = int(64 * s) # 64*0.125=8 +# self.dilation = 1 +# if replace_stride_with_dilation is None: +# replace_stride_with_dilation = [False, False, False] +# if len(replace_stride_with_dilation) != 3: +# raise ValueError("replace_stride_with_dilation should be None " +# "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) +# self.groups = groups # 8 +# self.base_width = width_per_group # 64 +# self.conv1 = nn.Conv2d(imgchan, self.inplanes, kernel_size=7, stride=2, padding=3, +# bias=False) # (h-7+6/2)+1=h/2 +# self.conv2 = nn.Conv2d(self.inplanes, 128, kernel_size=3, stride=1, padding=1, bias=False) # 尺寸不变 +# self.conv3 = nn.Conv2d(128, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False) # 尺寸不变 +# self.conv4 = nn.Conv2d(self.inplanes, self.inplanes*2, kernel_size=3, stride=1, padding=1, bias=False) +# +# self.bn1 = norm_layer(self.inplanes) +# self.bn2 = norm_layer(128) +# self.bn3 = 
norm_layer(self.inplanes) +# self.bn4 = norm_layer(self.inplanes*2) +# +# # self.conv1 = nn.Conv2d(1, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False) +# self.bn1 = norm_layer(self.inplanes) +# self.relu = nn.ReLU(inplace=True) +# self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) +# self.layer1 = self._make_layer(block, int(128 * s), layers[0], kernel_size=(img_size // 2)) +# self.layer2 = self._make_layer(block, int(256 * s), layers[1], stride=2, kernel_size=(img_size // 2), +# dilate=replace_stride_with_dilation[0]) +# # self.layer3 = self._make_layer(block, int(512 * s), layers[2], stride=2, kernel_size=(img_size//4), +# # dilate=replace_stride_with_dilation[1]) +# # self.layer4 = self._make_layer(block, int(1024 * s), layers[3], stride=2, kernel_size=(img_size//8), +# # dilate=replace_stride_with_dilation[2]) +# +# # Decoder +# # self.decoder1 = nn.Conv2d(int(1024 *2*s) , int(1024*2*s), kernel_size=3, stride=2, padding=1) +# # self.decoder2 = nn.Conv2d(int(1024 *2*s) , int(1024*s), kernel_size=3, stride=1, padding=1) +# # self.decoder3 = nn.Conv2d(int(1024*s), int(512*s), kernel_size=3, stride=1, padding=1) +# self.decoder4 = nn.Conv2d(int(512 * s), int(256 * s), kernel_size=3, stride=1, padding=1) +# self.decoder5 = nn.Conv2d(int(256 * s), int(128 * s), kernel_size=3, stride=1, padding=1) +# self.adjust = nn.Conv2d(int(128 * s), num_classes, kernel_size=1, stride=1, padding=0) +# self.soft = nn.Softmax(dim=1) +# +# self.conv1_p = nn.Conv2d(imgchan, self.inplanes, kernel_size=7, stride=2, padding=3, +# bias=False) +# self.conv2_p = nn.Conv2d(self.inplanes, 128, kernel_size=3, stride=1, padding=1, +# bias=False) +# self.conv3_p = nn.Conv2d(128, self.inplanes, kernel_size=3, stride=1, padding=1, +# bias=False) +# # self.conv1 = nn.Conv2d(1, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False) +# self.bn1_p = norm_layer(self.inplanes) +# self.bn2_p = norm_layer(128) +# self.bn3_p = norm_layer(self.inplanes) +# +# self.relu_p = 
nn.ReLU(inplace=True) +# +# img_size_p = img_size // 4 +# +# self.layer1_p = self._make_layer(block_2, int(128 * s), layers[0], kernel_size=(img_size_p // 2)) +# self.layer2_p = self._make_layer(block_2, int(256 * s), layers[1], stride=2, kernel_size=(img_size_p // 2), +# dilate=replace_stride_with_dilation[0]) +# self.layer3_p = self._make_layer(block_2, int(512 * s), layers[2], stride=2, kernel_size=(img_size_p // 4), +# dilate=replace_stride_with_dilation[1]) +# self.layer4_p = self._make_layer(block_2, int(1024 * s), layers[3], stride=2, kernel_size=(img_size_p // 8), +# dilate=replace_stride_with_dilation[2]) +# +# # Decoder +# self.decoder1_p = nn.Conv2d(int(1024 * 2 * s), int(1024 * 2 * s), kernel_size=3, stride=2, padding=1) +# self.decoder2_p = nn.Conv2d(int(1024 * 2 * s), int(1024 * s), kernel_size=3, stride=1, padding=1) +# self.decoder3_p = nn.Conv2d(int(1024 * s), int(512 * s), kernel_size=3, stride=1, padding=1) +# self.decoder4_p = nn.Conv2d(int(512 * s), int(256 * s), kernel_size=3, stride=1, padding=1) +# self.decoder5_p = nn.Conv2d(int(256 * s), int(128 * s), kernel_size=3, stride=1, padding=1) +# +# self.decoderf = nn.Conv2d(int(128 * s), int(128 * s), kernel_size=3, stride=1, padding=1) +# self.adjust_p = nn.Conv2d(int(128 * s), num_classes, kernel_size=1, stride=1, padding=0) +# self.soft_p = nn.Softmax(dim=1) +# +# def _make_layer(self, block, planes, blocks, kernel_size=56, stride=1, dilate=False): +# norm_layer = self._norm_layer +# downsample = None +# previous_dilation = self.dilation +# if dilate: +# self.dilation *= stride +# stride = 1 +# if stride != 1 or self.inplanes != planes * block.expansion: +# downsample = nn.Sequential( +# conv1x1(self.inplanes, planes * block.expansion, stride), +# norm_layer(planes * block.expansion), +# ) +# +# layers = [] +# layers.append(block(self.inplanes, planes, stride, downsample, groups=self.groups, +# base_width=self.base_width, dilation=previous_dilation, +# norm_layer=norm_layer, 
kernel_size=kernel_size)) +# self.inplanes = planes * block.expansion +# if stride != 1: +# kernel_size = kernel_size // 2 +# +# for _ in range(1, blocks): +# layers.append(block(self.inplanes, planes, groups=self.groups, +# base_width=self.base_width, dilation=self.dilation, +# norm_layer=norm_layer, kernel_size=kernel_size)) +# +# return nn.Sequential(*layers) +# +# def _forward_impl(self, x): +# +# xin_s = x.clone() +# xin_m = x.clone() +# xin_l = x.clone() +# +# xin = x.clone() +# +# +# x = self.conv1(x) # 3-> inplanes +# x = self.bn1(x) +# x = self.relu(x) +# x = self.conv2(x) # inplanes -> 128 +# x = self.bn2(x) +# x = self.relu(x) +# x = self.conv3(x) # 128 -> inplanes +# x = self.bn3(x) +# # x = F.max_pool2d(x,2,2) +# x = self.relu(x) +# +# x = self.conv4(x) +# x = self.bn4(x) +# x = self.relu(x) +# x = F.interpolate(x, scale_factor=(2, 2), mode='bilinear') +# print('x: ', x.shape) # [1, 8, 128, 128] +# +# '''# x = self.maxpool(x) +# # pdb.set_trace() +# x1 = self.layer1(x) # inplanes -> 128*s*2 inplanes在layers里面会乘以2 inplanes变为 planes*2(expansion) +# print('layer1: ', x1.shape) # [1, 32, 128, 128] +# x2 = self.layer2(x1) # 128*s*2 -> 256*s*2 inplances:256*s->256*s*2 +# print('layer2: ', x2.shape) # [1, 64, 64, 64] +# # x3 = self.layer3(x2) +# # # print(x3.shape) +# # x4 = self.layer4(x3) +# # # print(x4.shape) +# # x = F.relu(F.interpolate(self.decoder1(x4), scale_factor=(2,2), mode ='bilinear')) +# # x = torch.add(x, x4) +# # x = F.relu(F.interpolate(self.decoder2(x4) , scale_factor=(2,2), mode ='bilinear')) +# # x = torch.add(x, x3) +# # x = F.relu(F.interpolate(self.decoder3(x3) , scale_factor=(2,2), mode ='bilinear')) +# # x = torch.add(x, x2) +# x = F.relu(F.interpolate(self.decoder4(x2), scale_factor=(2, 2), mode='bilinear')) +# x = torch.add(x, x1) +# x = F.relu(F.interpolate(self.decoder5(x), scale_factor=(2, 2), mode='bilinear')) +# print(x.shape) # [1, 16, 256, 256]''' +# +# +# +# # 到这将全图片输入进行了两层transformer,每层都有残差连接。 +# # +# # end of full image 
training +# +# # y_out = torch.ones((1,2,128,128)) +# x_loc_s = x.clone() +# x_loc_m = x.clone() +# x_loc_l = x.clone() +# +# # x = F.relu(F.interpolate(self.decoder5(x) , scale_factor=(2,2), mode ='bilinear')) +# # start +# h_s = xin_s.shape[2] +# w_s = xin_s.shape[3] +# print('w_s: ', w_s) # 256 +# print('h_s: ', h_s) # 256 +# i_start = 0 +# i_end = 0 +# j_start = 0 +# j_end = 0 +# for i in range(0, h_s): +# for j in range(0, w_s): +# if i < h_s//8: +# if j < w_s//8: +# i_start = 0 +# i_end = h_s//4 +# j_start = 0 +# j_end = w_s//4 +# x_p_s = xin_s[:, :, i_start:i_end, j_start:j_end] +# elif j >= w_s*7//8-1: +# i_start = 0 +# i_end = h_s // 4 +# j_start = w_s*3//4 +# j_end = w_s +# x_p_s = xin_s[:, :, i_start:i_end, j_start:j_end] +# else: +# i_start = 0 +# i_end = h_s // 4 +# j_start = j-w_s//8 +# j_end = j+w_s//8 +# x_p_s = xin_s[:, :, i_start:i_end, j_start:j_end] +# +# elif i >= h_s*7//8-1: +# if j < w_s//8: +# i_start = h_s*3//4 +# i_end = h_s +# j_start = 0 +# j_end = w_s//4 +# x_p_s = xin_s[:, :, i_start:i_end, j_start:j_end] +# +# elif j >= w_s*7//8-1: +# i_start = h_s * 3 // 4 +# i_end = h_s +# j_start = w_s*3//4 +# j_end = w_s +# x_p_s = xin_s[:, :, i_start:i_end, j_start:j_end] +# +# else: +# i_start = h_s * 3 // 4 +# i_end = h_s +# j_start = j-w_s//8 +# j_end = j+w_s//8 +# x_p_s = xin_s[:, :, i_start:i_end, j_start:j_end] +# else: +# if j < w_s//8: +# i_start = i-h_s//8 +# i_end = i+h_s//8 +# j_start = 0 +# j_end = w_s//4 +# x_p_s = xin_s[:, :, i_start:i_end, j_start:j_end] +# +# elif j >= w_s*7//8-1: +# i_start = i-h_s//8 +# i_end = i+h_s//8 +# j_start = w_s*3//4 +# j_end = w_s +# x_p_s = xin_s[:, :, i_start:i_end, j_start:j_end] +# else: +# i_start = i-h_s//8 +# i_end = i+h_s//8 +# j_start = j - w_s//8 +# j_end = j + w_s//8 +# x_p_s = xin_s[:, :, i_start:i_end, j_start:j_end] +# print('x_p_s patch: ', x_p_s.shape) # [1, 3, 64, 64] 256/4跨度为H/4 +# print('inplans:', self.inplanes) # 256 +# +# x_p_s = self.conv1_p(x_p_s) +# print('conv1_p shape: ', 
x_p_s.shape) # [1, 64, 32, 32] stride=2 +# x_p_s = self.bn1_p(x_p_s) +# x_p_s = self.relu(x_p_s) +# +# x_p_s = self.conv2_p(x_p_s) +# print('conv2_p shape: ', x_p_s.shape) # [1, 128, 32, 32] +# x_p_s = self.bn2_p(x_p_s) +# x_p_s = self.relu(x_p_s) +# +# x_p_s = self.conv3_p(x_p_s) +# print('conv3_p shape: ', x_p_s.shape) # [1, 64, 32, 32] +# x_p_s = self.bn3_p(x_p_s) +# x_p_s = self.relu(x_p_s) +# +# x1_p_s = self.layer1_p(x_p_s) +# print('layer1_p shape: ', x1_p_s.shape) # [1, 32, 32, 32] +# x2_p_s = self.layer2_p(x1_p_s) +# print('layer2_p shape: ', x2_p_s.shape) # [1, 64, 16, 16] +# x3_p_s = self.layer3_p(x2_p_s) +# print('layer3_p shape: ', x3_p_s.shape) # [1, 128, 8, 8] +# x4_p_s = self.layer4_p(x3_p_s) +# print('x4_p_s shape: ', x4_p_s.shape) # [1, 256, 4, 4] +# +# x_p_s = F.relu(F.interpolate(self.decoder1_p(x4_p_s), scale_factor=(2, 2), mode='bilinear')) +# print('x_p_s shape: ', x_p_s.shape) # [1, 256, 4, 4] +# x_p_s = torch.add(x_p_s, x4_p_s) +# x_p_s = F.relu(F.interpolate(self.decoder2_p(x_p_s), scale_factor=(2, 2), mode='bilinear')) +# print('x_p_s shape: ', x_p_s.shape) # [1, 128, 8, 8] +# x_p_s = torch.add(x_p_s, x3_p_s) +# x_p_s = F.relu(F.interpolate(self.decoder3_p(x_p_s), scale_factor=(2, 2), mode='bilinear')) +# print('x_p_s shape: ', x_p_s.shape) # [1, 64, 16, 16] +# x_p_s = torch.add(x_p_s, x2_p_s) +# x_p_s = F.relu(F.interpolate(self.decoder4_p(x_p_s), scale_factor=(2, 2), mode='bilinear')) +# print('x_p_s shape: ', x_p_s.shape) # [1, 32, 32, 32] +# x_p_s = torch.add(x_p_s, x1_p_s) +# x_p_s = F.relu(F.interpolate(self.decoder5_p(x_p_s), scale_factor=(2, 2), mode='bilinear')) +# print('x_p_s shape: ', x_p_s.shape) # [1, 16, 64, 64] +# x_loc_s[:, :, i_start:i_end, j_start:j_end] = x_p_s +# print('i,j: ', i, j) +# +# +# +# # x = F.relu(F.interpolate(self.decoder5(x) , scale_factor=(2,2), mode ='bilinear')) +# # start +# +# xin_m = self.maxpool(xin_m) +# h_m = xin_m.shape[2] +# w_m = xin_m.shape[3] +# print('h_m: ', h_m) +# print('h_m: ', w_m) +# 
i_m_start = 0 +# i_m_end = 0 +# j_m_start = 0 +# j_m_end = 0 +# for i in range(0, h_m): +# for j in range(0, w_m): +# if i < h_m // 4: +# if j < w_m // 4: +# i_m_start = 0 +# i_m_end = h_m // 2 +# j_m_start = 0 +# j_m_end = w_m // 2 +# x_p_m = xin_m[:, :, i_m_start:i_m_end, j_m_start:j_m_end] +# elif j >= w_m * 3 // 4 - 1: +# i_m_start = 0 +# i_m_end = h_m // 2 +# j_m_start = w_m * 1 // 2 +# j_m_end = w_m +# x_p_m = xin_m[:, :, i_m_start:i_m_end, j_m_start:j_m_end] +# else: +# i_m_start = 0 +# i_m_end = h_m // 2 +# j_m_start = j - w_m//4 +# j_m_end = j + w_m//4 +# x_p_m = xin_m[:, :, i_m_start:i_m_end, j_m_start:j_m_end] +# +# elif i >= h_m * 3 // 4 - 1: +# if j < w_m // 4: +# i_m_start = h_m * 1 // 2 +# i_m_end = h_m +# j_m_start = 0 +# j_m_end = w_m // 2 +# x_p_m = xin_m[:, :, i_m_start:i_m_end, j_m_start:j_m_end] +# +# elif j >= w_m * 3 // 4 - 1: +# i_m_start = h_m * 1 // 2 +# i_m_end = h_m +# j_m_start = w_m * 1 // 2 +# j_m_end = w_m +# x_p_m = xin_m[:, :, i_m_start:i_m_end, j_m_start:j_m_end] +# +# else: +# i_m_start = h_m * 1 // 2 +# i_m_end = h_m +# j_m_start = j - w_m//4 +# j_m_end = j + w_m//4 +# x_p_m = xin_m[:, :, i_m_start:i_m_end, j_m_start:j_m_end] +# else: +# if j < w_m // 4: +# i_m_start = i - h_m//4 +# i_m_end = i + h_m//4 +# j_m_start = 0 +# j_m_end = w_m // 2 +# x_p_m = xin_m[:, :, i_m_start:i_m_end, j_m_start:j_m_end] +# +# elif j >= w_m * 3 // 4 - 1: +# i_m_start = i - h_m//4 +# i_m_end = i + h_m//4 +# j_m_start = w_m * 1 // 2 +# j_m_end = w_m +# x_p_m = xin_m[:, :, i_m_start:i_m_end, j_m_start:j_m_end] +# else: +# i_m_start = i - h_m//4 +# i_m_end = i + h_m//4 +# j_m_start = j - w_m//4 +# j_m_end = j + w_m//4 +# x_p_m = xin_m[:, :, i_m_start:i_m_end, j_m_start:j_m_end] +# print('x_p_m patch: ', x_p_m.shape) # [1, 3, 64, 64] 256/4跨度为H/4 +# x_p_m = self.conv1_p(x_p_m) +# x_p_m = self.bn1_p(x_p_m) +# x_p_m = self.relu(x_p_m) +# +# x_p_m = self.conv2_p(x_p_m) +# x_p_m = self.bn2_p(x_p_m) +# x_p_m = self.relu(x_p_m) +# +# x_p_m = 
self.conv3_p(x_p_m) +# x_p_m = self.bn3_p(x_p_m) +# x_p_m = self.relu(x_p_m) +# +# x1_p_m = self.layer1_p(x_p_m) +# x2_p_m = self.layer2_p(x1_p_m) +# # print(x2.shape) +# x3_p_m = self.layer3_p(x2_p_m) +# # # print(x3.shape) +# x4_p_m = self.layer4_p(x3_p_m) +# +# x_p_m = F.relu(F.interpolate(self.decoder1_p(x4_p_m), scale_factor=(2, 2), mode='bilinear')) +# x_p_m = torch.add(x_p_m, x4_p_m) +# x_p_m = F.relu(F.interpolate(self.decoder2_p(x_p_m), scale_factor=(2, 2), mode='bilinear')) +# x_p_m = torch.add(x_p_m, x3_p_m) +# x_p_m = F.relu(F.interpolate(self.decoder3_p(x_p_m), scale_factor=(2, 2), mode='bilinear')) +# x_p_m = torch.add(x_p_m, x2_p_m) +# x_p_m = F.relu(F.interpolate(self.decoder4_p(x_p_m), scale_factor=(2, 2), mode='bilinear')) +# x_p_m = torch.add(x_p_m, x1_p_m) +# x_p_m = F.relu(F.interpolate(self.decoder5_p(x_p_m), scale_factor=(2, 2), mode='bilinear')) +# x_loc_m[:, :, i_m_start:i_m_end, j_m_start:j_m_end] = x_p_m +# print('i, j: ', i, j) +# x_loc_m = F.interpolate(x_loc_m, scale_factor=(2, 2), mode='bilinear') # 上采样 +# +# +# +# xin_l = self.maxpool(xin_l) +# xin_l = self.maxpool(xin_l) +# h_l = xin_l.shape[2] +# w_l = xin_l.shape[3] +# i_l_start = 0 +# i_l_end = 0 +# j_l_start = 0 +# j_l_end = 0 +# for i in range(0, h_l): +# for j in range(0, w_l): +# i_l_start = 0 +# i_l_end = h_l +# j_l_start = 0 +# j_l_end = w_l +# x_p_l = xin_l[:, :, i_l_start:i_l_end, j_l_start:j_l_end] +# x_p_l = self.conv1_p(x_p_l) +# x_p_l = self.bn1_p(x_p_l) +# x_p_l = self.relu(x_p_l) +# +# x_p_l = self.conv2_p(x_p_l) +# x_p_l = self.bn2_p(x_p_l) +# x_p_l = self.relu(x_p_l) +# +# x_p_l = self.conv3_p(x_p_l) +# x_p_l = self.bn3_p(x_p_l) +# x_p_l = self.relu(x_p_l) +# +# x1_p_l = self.layer1_p(x_p_l) +# x2_p_l = self.layer2_p(x1_p_l) +# # print(x2.shape) +# x3_p_l = self.layer3_p(x2_p_l) +# # # print(x3.shape) +# x4_p_l = self.layer4_p(x3_p_l) +# +# x_p_l = F.relu(F.interpolate(self.decoder1_p(x4_p_l), scale_factor=(2, 2), mode='bilinear')) +# x_p_l = torch.add(x_p_l, 
x4_p_l) +# x_p_l = F.relu(F.interpolate(self.decoder2_p(x_p_l), scale_factor=(2, 2), mode='bilinear')) +# x_p_l = torch.add(x_p_l, x3_p_l) +# x_p_l = F.relu(F.interpolate(self.decoder3_p(x_p_l), scale_factor=(2, 2), mode='bilinear')) +# x_p_l = torch.add(x_p_l, x2_p_l) +# x_p_l = F.relu(F.interpolate(self.decoder4_p(x_p_l), scale_factor=(2, 2), mode='bilinear')) +# x_p_l = torch.add(x_p_l, x1_p_l) +# x_p_l = F.relu(F.interpolate(self.decoder5_p(x_p_l), scale_factor=(2, 2), mode='bilinear')) +# x_loc_l[:, :, i_l_start:i_l_end, j_l_start:j_l_end] = x_p_l +# x_loc_l = F.interpolate(x_loc_l, scale_factor=(2, 2), mode='bilinear') # 上采样 +# x_loc_l = F.interpolate(x_loc_l, scale_factor=(2, 2), mode='bilinear') # 上采样 +# # print('x_loc_s.shape: ', x_loc_s.shape) # [1, 3, 256, 256] +# +# '''x_loc = x.clone() +# for i in range(0, 4): +# for j in range(0, 4): +# x_p = xin[:, :, 64 * i:64 * (i + 1), 64 * j:64 * (j + 1)] +# print('x_p shape: ', x_p.shape) # [1, 3, 32, 32] +# # begin patch wise +# x_p = self.conv1_p(x_p) # imgchans-> inplans +# x_p = self.bn1_p(x_p) +# # x = F.max_pool2d(x,2,2) +# x_p = self.relu(x_p) +# +# x_p = self.conv2_p(x_p) +# x_p = self.bn2_p(x_p) +# # x = F.max_pool2d(x,2,2) +# x_p = self.relu(x_p) +# x_p = self.conv3_p(x_p) +# x_p = self.bn3_p(x_p) +# # x = F.max_pool2d(x,2,2) +# x_p = self.relu(x_p) +# +# # x = self.maxpool(x) +# # pdb.set_trace() +# x1_p = self.layer1_p(x_p) +# print('x1_p shape: ', x1_p.shape) +# x2_p = self.layer2_p(x1_p) +# print('x2_p shape: ', x2_p.shape) +# x3_p = self.layer3_p(x2_p) +# print('x3_p shape: ', x3_p.shape) +# x4_p = self.layer4_p(x3_p) +# +# x_p = F.relu(F.interpolate(self.decoder1_p(x4_p), scale_factor=(2, 2), mode='bilinear')) +# print('x_p shape: ', x_p.shape) +# x_p = torch.add(x_p, x4_p) +# x_p = F.relu(F.interpolate(self.decoder2_p(x_p), scale_factor=(2, 2), mode='bilinear')) +# x_p = torch.add(x_p, x3_p) +# x_p = F.relu(F.interpolate(self.decoder3_p(x_p), scale_factor=(2, 2), mode='bilinear')) +# x_p = 
class medt_net(nn.Module):
    """Segmentation network fusing a full-image stem with three patch-wise local branches.

    The forward pass computes a feature map from the whole input with a small
    convolutional stem, then overwrites copies of that map with features that
    a shared encoder/decoder (the ``*_p`` modules) produces from image patches
    at three scales:

    * small  - 4 x 4 grid of patches taken from the full-resolution input,
    * medium - 2 x 2 grid of patches taken from the once max-pooled input,
    * large  - the twice max-pooled input processed as a single patch.

    The three maps are brought back to a common size, summed, and projected
    to ``num_classes`` channels.

    Args:
        block: block class used to build ``layer1``/``layer2`` (constructed
            but not used by the current forward pass).
        block_2: block class used to build the patch encoder ``layer*_p``.
            Must expose an ``expansion`` attribute.
        layers: sequence of four ints, number of blocks per encoder stage.
        num_classes: number of output channels.
        zero_init_residual: accepted for interface compatibility; not used.
        groups: forwarded to every block.
        width_per_group: forwarded to every block as ``base_width``.
        replace_stride_with_dilation: ``None`` or three booleans; a true entry
            replaces the stride of the corresponding stage with dilation.
        norm_layer: normalisation layer factory, defaults to ``nn.BatchNorm2d``.
        s: channel width multiplier.
        img_size: nominal input edge length, used to derive the
            ``kernel_size`` passed to the blocks.
        imgchan: number of input image channels.

    Raises:
        ValueError: if ``replace_stride_with_dilation`` does not have 3 entries.
    """

    def __init__(self, block, block_2, layers, num_classes=2, zero_init_residual=True,
                 groups=8, width_per_group=64, replace_stride_with_dilation=None,
                 norm_layer=None, s=0.125, img_size=128, imgchan=3):
        super(medt_net, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer

        self.inplanes = int(64 * s)  # 64 * 0.125 = 8 with the default s
        self.dilation = 1
        if replace_stride_with_dilation is None:
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
        self.groups = groups
        self.base_width = width_per_group

        # ---- full-image stem -------------------------------------------------
        # conv1 halves the spatial size (stride 2); conv2..conv4 keep it.
        self.conv1 = nn.Conv2d(imgchan, self.inplanes, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.conv2 = nn.Conv2d(self.inplanes, 128, kernel_size=3, stride=1, padding=1, bias=False)
        self.conv3 = nn.Conv2d(128, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False)
        self.conv4 = nn.Conv2d(self.inplanes, self.inplanes * 2, kernel_size=3, stride=1, padding=1,
                               bias=False)

        self.bn1 = norm_layer(self.inplanes)
        self.bn2 = norm_layer(128)
        self.bn3 = norm_layer(self.inplanes)
        self.bn4 = norm_layer(self.inplanes * 2)

        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # layer1/layer2, decoder4/decoder5 and soft are not used by the current
        # forward pass.  They are still constructed, in the original order, so
        # that parameter names (state_dict keys) and the value of
        # ``self.inplanes`` seen by the patch branch below stay unchanged.
        self.layer1 = self._make_layer(block, int(128 * s), layers[0], kernel_size=(img_size // 2))
        self.layer2 = self._make_layer(block, int(256 * s), layers[1], stride=2,
                                       kernel_size=(img_size // 2),
                                       dilate=replace_stride_with_dilation[0])

        self.decoder4 = nn.Conv2d(int(512 * s), int(256 * s), kernel_size=3, stride=1, padding=1)
        self.decoder5 = nn.Conv2d(int(256 * s), int(128 * s), kernel_size=3, stride=1, padding=1)
        self.adjust = nn.Conv2d(int(128 * s), num_classes, kernel_size=1, stride=1, padding=0)
        self.soft = nn.Softmax(dim=1)

        # ---- patch-wise (local) encoder / decoder ----------------------------
        # NOTE: self.inplanes was advanced by the _make_layer calls above, so
        # the patch stem is wider than the full-image stem.
        self.conv1_p = nn.Conv2d(imgchan, self.inplanes, kernel_size=7, stride=2, padding=3,
                                 bias=False)
        self.conv2_p = nn.Conv2d(self.inplanes, 128, kernel_size=3, stride=1, padding=1,
                                 bias=False)
        self.conv3_p = nn.Conv2d(128, self.inplanes, kernel_size=3, stride=1, padding=1,
                                 bias=False)
        self.bn1_p = norm_layer(self.inplanes)
        self.bn2_p = norm_layer(128)
        self.bn3_p = norm_layer(self.inplanes)

        self.relu_p = nn.ReLU(inplace=True)

        img_size_p = img_size // 4

        self.layer1_p = self._make_layer(block_2, int(128 * s), layers[0],
                                         kernel_size=(img_size_p // 2))
        self.layer2_p = self._make_layer(block_2, int(256 * s), layers[1], stride=2,
                                         kernel_size=(img_size_p // 2),
                                         dilate=replace_stride_with_dilation[0])
        self.layer3_p = self._make_layer(block_2, int(512 * s), layers[2], stride=2,
                                         kernel_size=(img_size_p // 4),
                                         dilate=replace_stride_with_dilation[1])
        self.layer4_p = self._make_layer(block_2, int(1024 * s), layers[3], stride=2,
                                         kernel_size=(img_size_p // 8),
                                         dilate=replace_stride_with_dilation[2])

        self.decoder1_p = nn.Conv2d(int(1024 * 2 * s), int(1024 * 2 * s), kernel_size=3, stride=2,
                                    padding=1)
        self.decoder2_p = nn.Conv2d(int(1024 * 2 * s), int(1024 * s), kernel_size=3, stride=1,
                                    padding=1)
        self.decoder3_p = nn.Conv2d(int(1024 * s), int(512 * s), kernel_size=3, stride=1, padding=1)
        self.decoder4_p = nn.Conv2d(int(512 * s), int(256 * s), kernel_size=3, stride=1, padding=1)
        self.decoder5_p = nn.Conv2d(int(256 * s), int(128 * s), kernel_size=3, stride=1, padding=1)

        self.decoderf = nn.Conv2d(int(128 * s), int(128 * s), kernel_size=3, stride=1, padding=1)
        self.adjust_p = nn.Conv2d(int(128 * s), num_classes, kernel_size=1, stride=1, padding=0)
        self.soft_p = nn.Softmax(dim=1)

    def _make_layer(self, block, planes, blocks, kernel_size=56, stride=1, dilate=False):
        """Build one encoder stage of ``blocks`` consecutive ``block`` modules.

        Updates ``self.inplanes`` (and ``self.dilation`` when ``dilate`` is
        true) as a side effect, so the order of calls matters.
        """
        norm_layer = self._norm_layer
        downsample = None
        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1
        if stride != 1 or self.inplanes != planes * block.expansion:
            # Project the residual path when its shape would not match.
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                norm_layer(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample, groups=self.groups,
                            base_width=self.base_width, dilation=previous_dilation,
                            norm_layer=norm_layer, kernel_size=kernel_size))
        self.inplanes = planes * block.expansion
        if stride != 1:
            # The first block halved the feature map, so later blocks see a
            # map half the size.
            kernel_size = kernel_size // 2

        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes, groups=self.groups,
                                base_width=self.base_width, dilation=self.dilation,
                                norm_layer=norm_layer, kernel_size=kernel_size))

        return nn.Sequential(*layers)

    def _encode_decode_patch(self, patch):
        """Run one image patch through the shared local encoder/decoder.

        The patch goes through the ``*_p`` stem and ``layer1_p..layer4_p``,
        then through ``decoder1_p..decoder5_p``; each decoder output is
        upsampled x2 and the matching encoder feature is added as a skip
        connection.
        """
        feat = self.relu(self.bn1_p(self.conv1_p(patch)))
        feat = self.relu(self.bn2_p(self.conv2_p(feat)))
        feat = self.relu(self.bn3_p(self.conv3_p(feat)))

        enc1 = self.layer1_p(feat)
        enc2 = self.layer2_p(enc1)
        enc3 = self.layer3_p(enc2)
        enc4 = self.layer4_p(enc3)

        out = F.relu(F.interpolate(self.decoder1_p(enc4), scale_factor=(2, 2), mode='bilinear'))
        out = torch.add(out, enc4)
        out = F.relu(F.interpolate(self.decoder2_p(out), scale_factor=(2, 2), mode='bilinear'))
        out = torch.add(out, enc3)
        out = F.relu(F.interpolate(self.decoder3_p(out), scale_factor=(2, 2), mode='bilinear'))
        out = torch.add(out, enc2)
        out = F.relu(F.interpolate(self.decoder4_p(out), scale_factor=(2, 2), mode='bilinear'))
        out = torch.add(out, enc1)
        out = F.relu(F.interpolate(self.decoder5_p(out), scale_factor=(2, 2), mode='bilinear'))
        return out

    def _fill_from_patches(self, image, x_loc, grid):
        """Overwrite ``x_loc`` in place, tile by tile, with local patch features.

        ``image`` and ``x_loc`` are cut into the same ``grid`` x ``grid``
        tiling (tile size derived from ``x_loc``); each image tile is encoded
        with :meth:`_encode_decode_patch` and written into the matching tile
        of ``x_loc``.
        """
        tile_h = x_loc.shape[2] // grid
        tile_w = x_loc.shape[3] // grid
        for i in range(grid):
            rows = slice(tile_h * i, tile_h * (i + 1))
            for j in range(grid):
                cols = slice(tile_w * j, tile_w * (j + 1))
                x_loc[:, :, rows, cols] = self._encode_decode_patch(image[:, :, rows, cols])

    def _forward_impl(self, x):
        # Independent copies of the raw input, one per local scale.
        xin_s = x.clone()
        xin_m = x.clone()
        xin_l = x.clone()

        # Full-image stem; conv1 halves the resolution and the final
        # interpolation restores it.
        x = self.relu(self.bn1(self.conv1(x)))
        x = self.relu(self.bn2(self.conv2(x)))
        x = self.relu(self.bn3(self.conv3(x)))
        x = self.relu(self.bn4(self.conv4(x)))
        x = F.interpolate(x, scale_factor=(2, 2), mode='bilinear')

        x_loc_s = x.clone()
        x_loc_l = x.clone()
        x_loc_m = x.clone()

        # Small scale: 4 x 4 patches of the full-resolution input.
        self._fill_from_patches(xin_s, x_loc_s, grid=4)

        # Medium scale: patches are cut from the down-sampled image and
        # written back into the equally down-sampled feature map, which is
        # then interpolated back for fusion.
        x_loc_m = self.maxpool(x_loc_m)
        xin_m = self.maxpool(xin_m)
        self._fill_from_patches(xin_m, x_loc_m, grid=2)
        x_loc_m = F.interpolate(x_loc_m, scale_factor=(2, 2), mode='bilinear')

        # Large scale: pool twice so the whole image has the patch size
        # (xin_l) and the feature map has the matching extent (x_loc_l).
        xin_l = self.maxpool(xin_l)
        xin_l = self.maxpool(xin_l)
        x_loc_l = self.maxpool(x_loc_l)
        x_loc_l = self.maxpool(x_loc_l)
        x_loc_l[:, :, :, :] = self._encode_decode_patch(xin_l)
        x_loc_l = F.interpolate(x_loc_l, scale_factor=(2, 2), mode='bilinear')
        x_loc_l = F.interpolate(x_loc_l, scale_factor=(2, 2), mode='bilinear')

        # Fuse the three scales once they share a size, then classify.
        x = torch.add(x_loc_m, x_loc_l)
        x = torch.add(x, x_loc_s)
        x = F.relu(self.decoderf(x))  # 128*s -> 128*s channels

        x = self.adjust(F.relu(x))  # 128*s -> num_classes channels
        return x

    def forward(self, x):
        return self._forward_impl(x)


def mylogo(pretrained=False, **kwargs):
    """Build a :class:`medt_net` with ``AxialBlock`` for both branches.

    ``pretrained`` is accepted for interface compatibility and ignored; no
    weights are loaded.  ``s`` is fixed to 0.125 here, so it must not be
    passed through ``kwargs``.
    """
    model = medt_net(AxialBlock, AxialBlock, [1, 2, 4, 1], s=0.125, **kwargs)
    return model
__all__ = ['ResNet', 'resnet26', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
           'resnet152']


# Download locations of the ImageNet weights, keyed by architecture name.
# 'resnet26' has no entry: no pretrained weights are available for it.
model_urls = {
    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
    'resnext50_32x4d': 'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth',
    'resnext101_32x8d': 'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth',
    'wide_resnet50_2': 'https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth',
    'wide_resnet101_2': 'https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth',
}


def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=dilation, groups=groups, bias=False, dilation=dilation)


def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)


class BasicBlock(nn.Module):
    """Two-layer 3x3 residual block (used by ResNet-18/34).

    Raises:
        ValueError: if ``groups != 1`` or ``base_width != 64``.
        NotImplementedError: if ``dilation > 1``.
    """

    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super(BasicBlock, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        if groups != 1 or base_width != 64:
            raise ValueError('BasicBlock only supports groups=1 and base_width=64')
        if dilation > 1:
            raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
        # Both self.conv1 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = norm_layer(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = norm_layer(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            # Project the shortcut so it matches the main path's shape.
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out


class Bottleneck(nn.Module):
    """1x1 -> 3x3 -> 1x1 residual block with a 4x channel expansion."""

    # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
    # while original implementation places the stride at the first 1x1 convolution(self.conv1)
    # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385.
    # This variant is also known as ResNet V1.5 and improves accuracy according to
    # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super(Bottleneck, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        width = int(planes * (base_width / 64.)) * groups
        # Both self.conv2 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv1x1(inplanes, width)
        self.bn1 = norm_layer(width)
        self.conv2 = conv3x3(width, width, stride, groups, dilation)
        self.bn2 = norm_layer(width)
        self.conv3 = conv1x1(width, planes * self.expansion)
        self.bn3 = norm_layer(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            # Project the shortcut so it matches the main path's shape.
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out


class ResNet(nn.Module):
    """ResNet classifier: stem, four residual stages, global pooling, linear head.

    Args:
        block: residual block class (``BasicBlock`` or ``Bottleneck``); must
            expose an ``expansion`` attribute.
        layers: sequence of four ints, number of blocks per stage.
        num_classes: size of the final linear layer.
        zero_init_residual: zero the last BN weight of every residual branch.
        groups: forwarded to every block.
        width_per_group: forwarded to every block as ``base_width``.
        replace_stride_with_dilation: ``None`` or three booleans, one per
            stage 2..4; a true entry swaps that stage's stride for dilation.
        norm_layer: normalisation layer factory, defaults to ``nn.BatchNorm2d``.

    Raises:
        ValueError: if ``replace_stride_with_dilation`` does not have 3 entries.
    """

    def __init__(self, block, layers, num_classes=1000, zero_init_residual=False,
                 groups=1, width_per_group=64, replace_stride_with_dilation=None,
                 norm_layer=None):
        super(ResNet, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer

        self.inplanes = 64
        self.dilation = 1
        if replace_stride_with_dilation is None:
            # each element in the tuple indicates if we should replace
            # the 2x2 stride with a dilated convolution instead
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
        self.groups = groups
        self.base_width = width_per_group
        self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
                                       dilate=replace_stride_with_dilation[0])
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
                                       dilate=replace_stride_with_dilation[1])
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
                                       dilate=replace_stride_with_dilation[2])
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck):
                    nn.init.constant_(m.bn3.weight, 0)
                elif isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)

    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
        """Build one stage of ``blocks`` residual blocks.

        Updates ``self.inplanes`` (and ``self.dilation`` when ``dilate`` is
        true) as a side effect, so stages must be built in order.
        """
        norm_layer = self._norm_layer
        downsample = None
        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                norm_layer(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
                            self.base_width, previous_dilation, norm_layer))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes, groups=self.groups,
                                base_width=self.base_width, dilation=self.dilation,
                                norm_layer=norm_layer))

        return nn.Sequential(*layers)

    def _forward_impl(self, x):
        # See note [TorchScript super()]
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)

        return x

    def forward(self, x):
        return self._forward_impl(x)


def _resnet(arch, block, layers, pretrained, progress, **kwargs):
    """Build a :class:`ResNet` and optionally load ImageNet weights.

    Raises:
        ValueError: if ``pretrained`` is true but ``model_urls`` has no entry
            for ``arch`` (e.g. ``'resnet26'``).
    """
    if pretrained and arch not in model_urls:
        # Fail before building the model or touching the network.
        raise ValueError("no pretrained weights available for '{}'".format(arch))
    model = ResNet(block, layers, **kwargs)
    if pretrained:
        # Imported here because this module never imported the helper at
        # file level; the original call raised NameError.
        from torch.hub import load_state_dict_from_url
        state_dict = load_state_dict_from_url(model_urls[arch],
                                              progress=progress)
        model.load_state_dict(state_dict)
    return model


def resnet18(pretrained=False, progress=True, **kwargs):
    r"""ResNet-18 model from
    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress,
                   **kwargs)


def resnet34(pretrained=False, progress=True, **kwargs):
    r"""ResNet-34 model from
    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    return _resnet('resnet34', BasicBlock, [3, 4, 6, 3], pretrained, progress,
                   **kwargs)


def resnet26(pretrained=False, progress=True, **kwargs):
    """ResNet-26: ``Bottleneck`` blocks in a [1, 2, 4, 1] layout.

    No pretrained weights exist for this variant; ``pretrained=True`` raises
    ``ValueError``.
    """
    return _resnet('resnet26', Bottleneck, [1, 2, 4, 1], pretrained, progress,
                   **kwargs)


def resnet50(pretrained=False, progress=True, **kwargs):
    r"""ResNet-50 model from
    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    return _resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, progress,
                   **kwargs)


def resnet101(pretrained=False, progress=True, **kwargs):
    r"""ResNet-101 model from
    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    return _resnet('resnet101', Bottleneck, [3, 4, 23, 3], pretrained, progress,
                   **kwargs)


def resnet152(pretrained=False, progress=True, **kwargs):
    r"""ResNet-152 model from
    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    return _resnet('resnet152', Bottleneck, [3, 8, 36, 3], pretrained, progress,
                   **kwargs)
b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/models/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f0ac0f62c2fa7a98bf56d98b9a9168eb0f9a15fe --- /dev/null +++ b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/models/utils.py @@ -0,0 +1,6 @@ +import torch.nn as nn + + +class qkv_transform(nn.Conv1d): + """Conv1d for qkv_transform""" + diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/utils.py b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e6093bdaed50a81a88ea6a9d3d1842ad60378f04 --- /dev/null +++ b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/lib/utils.py @@ -0,0 +1,167 @@ +import math +import os +import torch +import torch.nn.functional as F + + +def adjust_learning_rate(args, optimizer, epoch, batch_idx, data_nums, type="cosine"): + if epoch < args.warmup_epochs: + epoch += float(batch_idx + 1) / data_nums + lr_adj = 1. * (epoch / args.warmup_epochs) + elif type == "linear": + if epoch < 30 + args.warmup_epochs: + lr_adj = 1. 
+ elif epoch < 60 + args.warmup_epochs: + lr_adj = 1e-1 + elif epoch < 90 + args.warmup_epochs: + lr_adj = 1e-2 + else: + lr_adj = 1e-3 + elif type == "cosine": + run_epochs = epoch - args.warmup_epochs + total_epochs = args.epochs - args.warmup_epochs + T_cur = float(run_epochs * data_nums) + batch_idx + T_total = float(total_epochs * data_nums) + + lr_adj = 0.5 * (1 + math.cos(math.pi * T_cur / T_total)) + + for param_group in optimizer.param_groups: + param_group['lr'] = args.lr * lr_adj + return args.lr * lr_adj + + +def label_smoothing(pred, target, eta=0.1): + ''' + Refer from https://arxiv.org/pdf/1512.00567.pdf + :param target: N, + :param n_classes: int + :param eta: float + :return: + N x C onehot smoothed vector + ''' + n_classes = pred.size(1) + target = torch.unsqueeze(target, 1) + onehot_target = torch.zeros_like(pred) + onehot_target.scatter_(1, target, 1) + return onehot_target * (1 - eta) + eta / n_classes * 1 + + +def cross_entropy_for_onehot(pred, target): + return torch.mean(torch.sum(- target * F.log_softmax(pred, dim=-1), 1)) + + +def cross_entropy_with_label_smoothing(pred, target, eta=0.1): + onehot_target = label_smoothing(pred, target, eta=eta) + return cross_entropy_for_onehot(pred, onehot_target) + + +def accuracy(output, target): + # get the index of the max log-probability + pred = output.max(1, keepdim=True)[1] + return pred.eq(target.view_as(pred)).cpu().float().mean() + + +def save_model(model, optimizer, epoch, args): + os.system('mkdir -p {}'.format(args.work_dirs)) + if optimizer is not None: + torch.save({ + 'net': model.state_dict(), + 'optim': optimizer.state_dict(), + 'epoch': epoch + }, os.path.join(args.work_dirs, '{}.pth'.format(epoch))) + else: + torch.save({ + 'net': model.state_dict(), + 'epoch': epoch + }, os.path.join(args.work_dirs, '{}.pth'.format(epoch))) + + +def dist_save_model(model, optimizer, epoch, ngpus_per_node, args): + if not args.multiprocessing_distributed or (args.multiprocessing_distributed + and 
def load_model(network, args):
    """Load checkpoint weights from ``args.work_dirs`` into ``network``.

    Checkpoints are expected to be named ``<epoch>.pth`` and to contain a
    ``'net'`` entry holding the state dict.

    :param network: module whose ``load_state_dict`` is called
    :param args: namespace with ``work_dirs`` and ``test_model``; ``-1`` for
        ``test_model`` selects ``-1.pth`` if present, otherwise the highest
        numbered checkpoint. ``distributed`` / ``gpu`` are optional and only
        used to pick a ``map_location``.
    :return: ``True`` on success
    :raises AssertionError: if the directory is missing or holds no checkpoint
    """
    if not os.path.exists(args.work_dirs):
        print("No such working directory!")
        raise AssertionError("No such working directory: {}".format(args.work_dirs))

    # Keep only files named "<int>.pth"; other files that merely contain
    # "pth" in their name used to crash the int() conversion.
    pths = []
    for fname in os.listdir(args.work_dirs):
        stem, ext = os.path.splitext(fname)
        if ext != '.pth':
            continue
        try:
            pths.append(int(stem))
        except ValueError:
            continue
    if not pths:
        print("No model to load!")
        raise AssertionError("No checkpoint found in {}".format(args.work_dirs))

    if args.test_model == -1:
        pth = -1 if -1 in pths else max(pths)
    else:
        pth = args.test_model
    ckpt_path = os.path.join(args.work_dirs, '{}.pth'.format(pth))

    model = None
    if getattr(args, 'distributed', False):
        try:
            model = torch.load(ckpt_path, map_location='cuda:{}'.format(args.gpu))
        except Exception:
            # Best effort: fall back to the default device mapping below.
            model = None
    if model is None:
        model = torch.load(ckpt_path)

    try:
        network.load_state_dict(model['net'], strict=True)
    except RuntimeError:
        # Key mismatch: retry after dropping the DataParallel "module." prefix.
        network.load_state_dict(convert_model(model['net']), strict=True)
    return True
def convert_model(model):
    """Return a copy of a state dict without the ``module.`` key prefix.

    ``nn.DataParallel`` saves parameters under ``module.<name>``. Keys that
    do not carry that prefix are copied unchanged (the previous version cut
    the first seven characters off every key unconditionally).

    :param model: mapping from parameter name to value
    :return: new ``dict`` with the prefix removed where present
    """
    prefix = 'module.'
    return {
        (key[len(prefix):] if key.startswith(prefix) else key): value
        for key, value in model.items()
    }
def make_weighted_metric(classwise_metric):
    """Wrap a class-wise metric in a function that also accepts ``weights``.

    NOTE: the returned function validates ``weights`` but does not apply
    them; it returns the raw per-class scores, as the original did.

    Args:
        classwise_metric: callable ``(output, gt) -> tensor`` of per-class
            scores, e.g. classwise_iou or classwise_f1
    """

    def weighted_metric(output, gt, weights=None):
        """Return ``classwise_metric(output, gt)`` moved to the CPU.

        Raises:
            ValueError: if ``weights`` is given and its length differs from
                ``output.shape[1]`` (the number of classes).
        """
        # Identity test: "== None" is elementwise on arrays/tensors.
        if weights is not None and len(weights) != output.shape[1]:
            raise ValueError("The number of weights must match with the number of classes")

        # The weights are deliberately not normalised here: the result was
        # never used, and the in-place division modified the caller's tensor
        # and failed for integer tensors.
        return classwise_metric(output, gt).cpu()

    return weighted_metric
+ """ + def __init__(self, eps=1e-5, resolution=(1, 1), inf_result=np.nan): + self.eps = eps + self.resolution = resolution + self.inf_result = inf_result + + def _check_inf(self, result): + if result == np.inf: # inf 无穷 + return self.inf_result + else: + return result + + def _calculate_overlap_metrics(self, gt, pred): + output = pred.view(-1, ) + target = gt.view(-1, ).float() + + tp = torch.sum(output * target) # TP + fp = torch.sum(output * (1 - target)) # FP + fn = torch.sum((1 - output) * target) # FN + tn = torch.sum((1 - output) * (1 - target)) # TN + + pixel_acc = (tp + tn + self.eps) / (tp + tn + fp + fn + self.eps) + dice = (2 * tp + self.eps) / (2 * tp + fp + fn + self.eps) + precision = (tp + self.eps) / (tp + fp + self.eps) + recall = (tp + self.eps) / (tp + fn + self.eps) + specificity = (tn + self.eps) / (tn + fp + self.eps) + + metric_dict = dict() + metric_dict['pixel_acc'] = pixel_acc.item() + metric_dict['dice'] = dice.item() + metric_dict['precision'] = precision.item() + metric_dict['recall'] = recall.item() + metric_dict['specificity'] = specificity.item() + + return metric_dict + + def _calculate_distance_metrics(self, gt, pred): + # shape: (N, C, H, W) + gt_class = gt[0, ...].cpu().numpy().astype(np.int).astype(np.bool) # (H, W) + pred_class = pred[0, 0, ...].cpu().numpy().astype(np.int).astype(np.bool) # (H, W) + # surface_distance_dict = compute_surface_distances(gt_class, pred_class, self.resolution) + # distances = surface_distance_dict['distances_pred_to_gt'] + # mean_surface_distance = self._check_inf(np.mean(distances)) + + # compute Hausdorff distance (95 percentile) + # hd95 = self._check_inf(compute_robust_hausdorff(surface_distance_dict, percent=95)) + + metric_dict = dict() + # metric_dict['mean_surface_distance'] = mean_surface_distance + # metric_dict['hd95'] = hd95 + + return metric_dict + + def _calculate_mae(self, gt, pred): + # shape: (N, C, H, W) + residual = torch.abs(gt.unsqueeze(1) - pred) + mae = torch.mean(residual, 
class MetricMeter(object):
    """
    Metric recorder: keeps one list of observed values per metric name and
    reports their mean and standard deviation.
    """
    def __init__(self, metrics):
        # metrics: iterable of metric names; each becomes a list attribute
        self.metrics = metrics
        self.initialization()

    def initialization(self):
        """Reset every tracked metric to an empty list."""
        for metric in self.metrics:
            # setattr instead of exec: no code is built from the metric name
            setattr(self, metric, [])

    def update(self, metric_dict):
        """
        Append the values of a new metric dictionary to the recorder.
        Keys that are not tracked by this meter are ignored.
        :param metric_dict: mapping of metric name to value
        :return: None
        """
        for metric_key, metric_value in metric_dict.items():
            if metric_key not in self.metrics:
                continue
            history = getattr(self, metric_key, None)
            if isinstance(history, list):
                history.append(metric_value)

    def report(self, print_stats=True):
        """
        Summarise the metrics recorded so far.
        :param print_stats: whether to also print the summary to the terminal
        :return: report_str, the concatenated "name: mean ± std;" entries
        """
        report_str = ''
        for metric in self.metrics:
            values = getattr(self, metric)
            # NaN-aware mean/std along the sample axis
            metric_mean = np.nanmean(values, axis=0)
            metric_std = np.nanstd(values, axis=0)
            # Built unconditionally so the return value does not depend on
            # print_stats.
            stats = metric + ': {} ± {};'.format(np.around(metric_mean, decimals=4),
                                                 np.around(metric_std, decimals=4))
            if print_stats:
                print(stats, end=' ')
            report_str += stats
        return report_str
uni+1; + end + + end + end + + if (tp~=0) + F = (2*tp)/(2*tp+fp+fn); + MIU=[MIU,(tp*1.0/uni)]; + PA=[PA,(tp*1.0/ttp)]; + Fsc=[Fsc;[i,F]]; + else + MIU=[MIU,1]; + PA=[PA,1]; + Fsc=[Fsc;[i,1]]; + + end + + + +end + if bestfsc <= mean(Fsc) & (mean(Fsc) ~= 1) + bestfsc = mean(Fsc); + bestmiu = mean(MIU,2); + bestpa = mean(PA,2); + bestep = 50*k; + + end + mean(Fsc) +end + +bestfsc +bestmiu +bestpa +bestep + +% plot(Fsc(:,1),Fsc(:,2),'-*') +% hold on +% plot(Fsc(:,1),Fsc1(:,2),'-s') +% hold off +% figure();plot(Fsc(:,1),PA,'-*');hold on +% plot(Fsc(:,1),PA1,'-s');hold off +% Fsc1=Fsc; +% MIU1=MIU; +% PA1=PA; diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/performancemetrics_glas.m b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/performancemetrics_glas.m new file mode 100644 index 0000000000000000000000000000000000000000..803e3d1e559b333477cec111ea23720f8a3b831c --- /dev/null +++ b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/performancemetrics_glas.m @@ -0,0 +1,96 @@ + +% close all; +% clear all; +% clc; +N = 79 +st = 1; +Fsc=[]; +MIU=[]; +PA=[]; +bestfsc=0; +bestmiu=0; +bestpa=0; +bestep = 0; + +for k = 1:24 + k + Fsc=[]; + MIU=[]; + PA=[]; +for i = st:st+N + i; + %gname = strcat('./Brain_test/',num2str(i,'%04d'),'.png'); + + tname = '/media/jeyamariajose/7888230b-5c10-4229-90f2-c78bdae9c5de/Data/Projects/axialseg/KiU-Net-pytorch/results/glas/medT/'; + imgname = strcat(tname,num2str(50*k),'/',num2str(i,'%02d'),'.png'); + lname = '/media/jeyamariajose/7888230b-5c10-4229-90f2-c78bdae9c5de/Data/glas/resized/test/labelcol/'; + + labelname = strcat(lname, num2str(i,'%02d'),'.png'); + + I = double(imread(imgname));tmp2=zeros(128,128); + tmp2(I>130) = 255; + tmp2(I<131) = 0; + tmp = double(imread(labelname)); + tmp = tmp(:,:,1); + tmp(tmp<130)=0;tmp(tmp>131)=255; + + + + tp=0;fp=0;fn=0;tn=0;uni=0;ttp=0;lab=0; + + for p =1:128 + for q =1:128 + if tmp(p,q)==0 + if 
tmp2(p,q) == tmp(p,q) + tn = tn+1; + else + fp = fp+1; + uni = uni+1; + ttp = ttp+1; + end + elseif tmp(p,q)==255 + lab = lab +1; + if tmp2(p,q) == tmp(p,q) + tp = tp+1; + ttp = ttp+1; + else + fn = fn+1; + end + uni = uni+1; + end + + end + end + + + if (tp~=0) + F = (2*tp)/(2*tp+fp+fn); + MIU=[MIU,(tp*1.0/uni)]; + PA=[PA,(tp*1.0/ttp)]; + Fsc=[Fsc;[i,F]]; + + else + MIU=[MIU,1]; + PA=[PA,1]; + Fsc=[Fsc;[i,1]]; + + end + + + +end + if bestfsc <= mean(Fsc) & (mean(Fsc) ~= 1) + bestfsc = mean(Fsc); + bestmiu = mean(MIU,2); + bestpa = mean(PA,2); + bestep = 50*k; + + end + mean(Fsc) +end + +bestfsc +bestmiu +bestpa +bestep + diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/performancemetrics_monuseg.m b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/performancemetrics_monuseg.m new file mode 100644 index 0000000000000000000000000000000000000000..af574303f354788e6969b0019daab60c7350f679 --- /dev/null +++ b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/performancemetrics_monuseg.m @@ -0,0 +1,108 @@ + +% close all; +% clear all; +% clc; +N = 328 +st = 0; +Fsc=[]; +MIU=[]; +PA=[]; +bestfsc=0; +bestmiu=0; +bestpa=0; +bestep = 0; + +folder = '/media/jeyamariajose/7888230b-5c10-4229-90f2-c78bdae9c5de/Data/monuseg/resized/test/labelcol/'; +listinfo = dir(strcat(folder,'*.png')); +lm = length(listinfo); + + +for k = 1:10 + k + Fsc=[]; + MIU=[]; + PA=[]; +for i = 1:lm + %I = double(imread(strcat(folder,listinfo(i).name))); + imgfile = strcat(folder,listinfo(i).name); + imgname = listinfo(i).name(1:27) ; + i; + %gname = strcat('./Brain_test/',num2str(i,'%04d'),'.png'); + + lname = '/media/jeyamariajose/7888230b-5c10-4229-90f2-c78bdae9c5de/Data/Projects/axialseg/KiU-Net-pytorch/results/monuseg/medTr/'; + labelname = strcat(lname, num2str(k*10),'/', imgname); + %imgname + I = double(imread(imgfile));tmp2=zeros(512,512); + %I = rgb2gray(I); + tmp2(I>127) = 255; + tmp2(I<126) = 
0; + tmp = double(imread(labelname)); + + tmp(tmp<127)=0;tmp(tmp>126)=255; + %tmp2 = I; + tp=0;fp=0;fn=0;tn=0;uni=0;ttp=0;lab=0; + + for p =1:512 + for q =1:512 + if tmp(p,q)==0 + if tmp2(p,q) == tmp(p,q) + tn = tn+1; + else + fp = fp+1; + uni = uni+1; + ttp = ttp+1; + end + elseif tmp(p,q)==255 + lab = lab +1; + if tmp2(p,q) == tmp(p,q) + tp = tp+1; + ttp = ttp+1; + else + fn = fn+1; + end + uni = uni+1; + end + + end + end + + if (tp~=0) + F = (2*tp)/(2*tp+fp+fn); + MIU=[MIU,(tp*1.0/uni)]; + PA=[PA,(tp*1.0/ttp)]; + Fsc=[Fsc;[i,F]]; + else + MIU=[MIU,1]; + PA=[PA,1]; + Fsc=[Fsc;[i,1]]; + + end + + + +end + + if bestfsc <= mean(Fsc) & (mean(Fsc) ~= 1) + bestfsc = mean(Fsc); + bestmiu = mean(MIU,2); + bestpa = mean(PA,2); + bestep = 10*k; + + end + mean(Fsc) +end + +bestfsc +bestmiu +%bestpa +bestep + +% plot(Fsc(:,1),Fsc(:,2),'-*') +% hold on +% plot(Fsc(:,1),Fsc1(:,2),'-s') +% hold off +% figure();plot(Fsc(:,1),PA,'-*');hold on +% plot(Fsc(:,1),PA1,'-s');hold off +% Fsc1=Fsc; +% MIU1=MIU; +% PA1=PA; diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/requirements.txt b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..e5dffd903b38e80116f9ad403468f351542a5d12 --- /dev/null +++ b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/requirements.txt @@ -0,0 +1,4 @@ +torch>=1.4.0 +torchvision>=0.5.0 +scikit-learn==0.23.2 +scipy==1.5.3 diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/test.py b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/test.py new file mode 100644 index 0000000000000000000000000000000000000000..b707bd7293227e84373c97d9d6137042245c97db --- /dev/null +++ b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/test.py @@ -0,0 +1,149 @@ +import argparse +import lib +import torch 
# ---------------------------------------------------------------------------
# Inference script: loads a trained model and writes one binary mask (PNG)
# per validation image into --direc.
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(description='MedT')
parser.add_argument('-j', '--workers', default=16, type=int, metavar='N',
                    help='number of data loading workers (default: 16)')
parser.add_argument('--epochs', default=100, type=int, metavar='N',
                    help='number of total epochs to run(default: 100)')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')
parser.add_argument('-b', '--batch_size', default=1, type=int,
                    metavar='N', help='batch size (default: 1)')
parser.add_argument('--learning_rate', default=1e-3, type=float,
                    metavar='LR', help='initial learning rate (default: 0.001)')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum')
parser.add_argument('--weight-decay', '--wd', default=1e-5, type=float,
                    metavar='W', help='weight decay (default: 1e-5)')
parser.add_argument('--train_dataset', type=str)
parser.add_argument('--val_dataset', type=str)
parser.add_argument('--save_freq', type=int, default=5)
parser.add_argument('--modelname', default='off', type=str,
                    help='name of the model to load')
parser.add_argument('--cuda', default="on", type=str,
                    help='switch on/off cuda option (default: on)')
# The script reads args.aug below; without this option it crashed with
# AttributeError right after argument parsing.
parser.add_argument('--aug', default='off', type=str,
                    help='turn on img augmentation (default: off)')
parser.add_argument('--direc', default='./results', type=str,
                    help='directory to save')
parser.add_argument('--crop', type=int, default=None)
parser.add_argument('--device', default='cuda', type=str)
parser.add_argument('--loaddirec', default='load', type=str)
parser.add_argument('--imgsize', type=int, default=None)
parser.add_argument('--gray', default='no', type=str)
args = parser.parse_args()

direc = args.direc
gray_ = args.gray
aug = args.aug
modelname = args.modelname
imgsize = args.imgsize
loaddirec = args.loaddirec

# Pick the data pipeline matching the number of input channels.
if gray_ == "yes":
    from utils_gray import JointTransform2D, ImageToImage2D, Image2D
    imgchant = 1
else:
    from utils import JointTransform2D, ImageToImage2D, Image2D
    imgchant = 3

crop = (args.crop, args.crop) if args.crop is not None else None

tf_train = JointTransform2D(crop=crop, p_flip=0.5, color_jitter_params=None, long_mask=True)
tf_val = JointTransform2D(crop=crop, p_flip=0, color_jitter_params=None, long_mask=True)
train_dataset = ImageToImage2D(args.train_dataset, tf_val)
val_dataset = ImageToImage2D(args.val_dataset, tf_val)
predict_dataset = Image2D(args.val_dataset)
dataloader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
valloader = DataLoader(val_dataset, 1, shuffle=True)

# Honour --device (default 'cuda', i.e. the previously hard-coded value).
device = torch.device(args.device)

if modelname == "axialunet":
    model = lib.models.axialunet(img_size=imgsize, imgchan=imgchant)
elif modelname == "MedT":
    model = lib.models.axialnet.MedT(img_size=imgsize, imgchan=imgchant)
elif modelname == "gatedaxialunet":
    model = lib.models.axialnet.gated(img_size=imgsize, imgchan=imgchant)
elif modelname == "logo":
    model = lib.models.axialnet.logo(img_size=imgsize, imgchan=imgchant)
else:
    # Previously an unknown name (including the default 'off') surfaced later
    # as "NameError: name 'model' is not defined".
    raise ValueError(
        "Unknown --modelname {!r}; expected one of "
        "'axialunet', 'MedT', 'gatedaxialunet', 'logo'".format(modelname))

if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
    model = nn.DataParallel(model, device_ids=[0, 1]).cuda()
model.to(device)

model.load_state_dict(torch.load(loaddirec, map_location=device))
model.eval()

fulldir = direc + "/"
os.makedirs(fulldir, exist_ok=True)

# Inference only: no autograd graph is needed.
with torch.no_grad():
    for batch_idx, (X_batch, y_batch, *rest) in enumerate(valloader):
        # Use the file name supplied by the dataset when there is one,
        # otherwise fall back to a running index.
        if rest and isinstance(rest[0][0], str):
            image_filename = rest[0][0]
        else:
            image_filename = '%s.png' % str(batch_idx + 1).zfill(3)

        X_batch = X_batch.to(device)
        y_out = model(X_batch)

        # Threshold the network output at 0.5 and scale to {0, 255}.
        yHaT = y_out.detach().cpu().numpy()
        yHaT[yHaT >= 0.5] = 1
        yHaT[yHaT < 0.5] = 0
        yHaT = yHaT.astype(np.uint8)
        yHaT[yHaT == 1] = 255

        del X_batch, y_batch, y_out

        # Channel 1 is written out as the predicted mask.
        cv2.imwrite(fulldir + image_filename, yHaT[0, 1, :, :])
Image2D +from metrics import jaccard_index, f1_score, LogNLLLoss,classwise_f1, BinaryMetrics,MetricMeter +# from utils import chk_mkdir, Logger, MetricList +import cv2 +from functools import partial +from random import randint +import timeit + +from skimage.color import gray2rgb +from torch.utils.data import Dataset +from skimage.exposure import equalize_adapthist, rescale_intensity, adjust_gamma +import re +from sklearn.model_selection import train_test_split +from torchsummary import summary +from torchvision.transforms import Compose +from HNC_ZXY.utils.data_pipeline import * + +parser = argparse.ArgumentParser(description='MedT') +parser.add_argument('-j', '--workers', default=16, type=int, metavar='N', + help='number of data loading workers (default: 8)') +parser.add_argument('--epochs', default=40, type=int, metavar='N', + help='number of total epochs to run(default: 400)') +parser.add_argument('--start-epoch', default=0, type=int, metavar='N', + help='manual epoch number (useful on restarts)') +parser.add_argument('-b', '--batch_size', default=2, type=int, + metavar='N', help='batch size (default: 1)') +parser.add_argument('--learning_rate', default=1e-3, type=float, + metavar='LR', help='initial learning rate (default: 0.001)') +parser.add_argument('--momentum', default=0.9, type=float, metavar='M', + help='momentum') +parser.add_argument('--weight-decay', '--wd', default=1e-5, type=float, + metavar='W', help='weight decay (default: 1e-5)') +# parser.add_argument('--train_dataset', required=True, type=str) +# parser.add_argument('--val_dataset', type=str) +parser.add_argument('--save_freq', type=int,default = 10) + +parser.add_argument('--modelname', default='myaxial', type=str, + help='type of model') +parser.add_argument('--cuda', default="on", type=str, + help='switch on/off cuda option (default: off)') +parser.add_argument('--aug', default='off', type=str, + help='turn on img augmentation (default: False)') +parser.add_argument('--load', 
class SegmentationDataset(Dataset):
    """2-D slice dataset backed by paired ``.npy`` image and mask files.

    Files are selected from ``image_root`` when the second ``_``-separated
    token of the file name (e.g. ``patient_0_slice_0.npy`` -> ``"0"``) is in
    ``subject_list``; the mask of a slice has the same file name under
    ``mask_root``.
    """

    def __init__(self, image_root, mask_root, subject_list, transform=None, vis=False):
        self.image_root = image_root
        self.mask_root = mask_root
        self.transform = transform
        self.subject_list = subject_list
        # vis=True makes __getitem__ also return the slice identifier
        self.vis = vis
        self.file_list = self.get_file_list()

    def get_file_list(self):
        """Return the sorted names in ``image_root`` whose id is selected.

        Names without an ``_``-separated id token are skipped instead of
        raising ``IndexError``; sorting makes the order independent of the
        arbitrary ``os.listdir`` order.
        """
        subjects = set(self.subject_list)
        file_list = []
        for file in sorted(os.listdir(self.image_root)):
            parts = file.split('_')
            if len(parts) < 2:
                continue
            if parts[1] in subjects:
                file_list.append(file)
        return file_list

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, i):
        """Load slice ``i`` and return ``{'image', 'mask'}`` (plus id if vis)."""
        file_name = self.file_list[i]
        image_filename = os.path.join(self.image_root, file_name)
        mask_filename = os.path.join(self.mask_root, file_name)
        # Identifier = file name up to the first dot, e.g. "patient_0_slice_0"
        data_id = file_name.split('.')[0]

        image = np.load(image_filename)
        # Window the raw intensities, then replicate to three channels.
        image = gray2rgb(remap_by_window(image, window_width=80, window_level=1035))
        mask = np.load(mask_filename)

        sample = {'image': image, 'mask': mask}
        if self.transform:
            sample = self.transform(sample)

        if self.vis:
            return sample, data_id
        return sample
train_test_split(subject_list, test_size=0.2, random_state=512) +train_dataset = SegmentationDataset(image_root, mask_root, train_list, transform=tf_train) +val_dataset = SegmentationDataset(image_root, mask_root, val_list, transform=tf_val) + + +# train_dataset = ImageToImage2D(args.train_dataset, tf_train) +# val_dataset = ImageToImage2D(args.val_dataset, tf_val) +# predict_dataset = Image2D(args.val_dataset) +dataloader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True) +valloader = DataLoader(val_dataset, 1, shuffle=True) + +device = torch.device("cuda") + +if modelname == "axialunet": + model = lib.models.axialunet(img_size = imgsize, imgchan = imgchant) +elif modelname == "MedT": + model = lib.models.axialnet.MedT(img_size = imgsize, imgchan = imgchant) +elif modelname == "gatedaxialunet": + model = lib.models.axialnet.gated(img_size = imgsize, imgchan = imgchant) +elif modelname == "logo": + model = lib.models.axialnet.logo(img_size = imgsize, imgchan = imgchant) +elif modelname == "myaxial": + model = lib.models.myaxialnet.mylogo(img_size = imgsize, imgchan = imgchant) + +if torch.cuda.device_count() > 1: + print("Let's use", torch.cuda.device_count(), "GPUs!") + # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] 
on 3 GPUs + model = nn.DataParallel(model, device_ids=[0,1]).cuda() +model.to(device) +summary(model, input_size=(3, 512, 512), batch_size=-1) + +# summary(model, input_size=(3, 256, 256), batch_size=-1) +criterion = LogNLLLoss() + +optimizer = torch.optim.Adam(list(model.parameters()), lr=args.learning_rate, + weight_decay=1e-5) + + +pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad) +print("Total_params: {}M".format(pytorch_total_params/1e6)) + +seed = 3000 +np.random.seed(seed) +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +# torch.set_deterministic(True) +# random.seed(seed) + + +for epoch in range(args.epochs): + print('epoch: ', epoch) + epoch_running_loss = 0 + + for batch_idx, (X_batch, y_batch, *rest) in enumerate(dataloader): + + + + X_batch = Variable(X_batch.to(device ='cuda')) + y_batch = Variable(y_batch.to(device='cuda')) + + # ===================forward===================== + + + output = model(X_batch) + + tmp2 = y_batch.detach().cpu().numpy() + tmp = output.detach().cpu().numpy() + tmp[tmp>=0.5] = 1 + tmp[tmp<0.5] = 0 + tmp2[tmp2>0] = 1 + tmp2[tmp2<=0] = 0 + tmp2 = tmp2.astype(int) + tmp = tmp.astype(int) + + yHaT = tmp + yval = tmp2 + + + + loss = criterion(output, y_batch) + + # ===================backward==================== + optimizer.zero_grad() + loss.backward() + optimizer.step() + epoch_running_loss += loss.item() + + # ===================log======================== + print('epoch [{}/{}], loss:{:.4f}' + .format(epoch, args.epochs, epoch_running_loss/(batch_idx+1))) + + + if epoch == 10: + for param in model.parameters(): + param.requires_grad =True + if (epoch % args.save_freq) ==0: + metric_list = ['pixel_acc', 'dice', 'precision', 'recall', 'specificity', 'mean_surface_distance'] + metric_meter = MetricMeter(metrics=metric_list) + + for batch_idx, (X_batch, y_batch, *rest) in enumerate(valloader): + # print(batch_idx) + if isinstance(rest[0][0], str): + image_filename = rest[0][0] + else: + 
image_filename = '%s.png' % str(batch_idx + 1).zfill(3) + + X_batch = Variable(X_batch.to(device='cuda')) + y_batch = Variable(y_batch.to(device='cuda')) + # start = timeit.default_timer() + y_out = model(X_batch) + # stop = timeit.default_timer() + # print('Time: ', stop - start) + tmp2 = y_batch.detach().cpu().numpy() + tmp = y_out.detach().cpu().numpy() + tmp[tmp>=0.5] = 1 + tmp[tmp<0.5] = 0 + tmp2[tmp2>0] = 1 + tmp2[tmp2<=0] = 0 + tmp2 = tmp2.astype(int) + tmp = tmp.astype(int) + + # print(np.unique(tmp2)) + yHaT = tmp + yval = tmp2 + + epsilon = 1e-20 + + metrics = BinaryMetrics()(tmp2, tmp) # 返回列表内含各种评价方式 + metric_meter.update(metrics) # 更新 update + + # print('[ Validation ] Loss: {:.4f}'.format(np.mean(loss_list)), end=' ') + metric_meter.report(print_stats=True) + + + # del X_batch, y_batch,tmp,tmp2, y_out + # + # + # yHaT[yHaT==1] =255 + # yval[yval==1] =255 + # fulldir = direc+"/{}/".format(epoch) + # print(fulldir+image_filename) + # if not os.path.isdir(fulldir): + # + # os.makedirs(fulldir) + # + # cv2.imwrite(fulldir+image_filename, yHaT[0,1,:,:]) + # # cv2.imwrite(fulldir+'/gt_{}.png'.format(count), yval[0,:,:]) + # fulldir = direc+"/{}/".format(epoch) + # torch.save(model.state_dict(), fulldir+args.modelname+".pth") + # torch.save(model.state_dict(), direc+"final_model.pth") + + + + + diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/utils.py b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..7fe23fcc8dba7372315e56abcf0f97292fe974fb --- /dev/null +++ b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/utils.py @@ -0,0 +1,285 @@ +import os +import numpy as np +import torch + +from skimage import io,color +from PIL import Image +from torch.utils.data import Dataset +from torchvision import transforms as T +from torchvision.transforms import functional as F + +from typing import 
Callable +import os +import cv2 +import pandas as pd + +from numbers import Number +from typing import Container +from collections import defaultdict + + +def to_long_tensor(pic): + # handle numpy array + img = torch.from_numpy(np.array(pic, np.uint8)) + # backward compatibility + return img.long() + + +def correct_dims(*images): + corr_images = [] + # print(images) + for img in images: + if len(img.shape) == 2: + corr_images.append(np.expand_dims(img, axis=2)) + else: + corr_images.append(img) + + if len(corr_images) == 1: + return corr_images[0] + else: + return corr_images + + +class JointTransform2D: + """ + Performs augmentation on image and mask when called. Due to the randomness of augmentation transforms, + it is not enough to simply apply the same Transform from torchvision on the image and mask separetely. + Doing this will result in messing up the ground truth mask. To circumvent this problem, this class can + be used, which will take care of the problems above. + + Args: + crop: tuple describing the size of the random crop. If bool(crop) evaluates to False, no crop will + be taken. + p_flip: float, the probability of performing a random horizontal flip. + color_jitter_params: tuple describing the parameters of torchvision.transforms.ColorJitter. + If bool(color_jitter_params) evaluates to false, no color jitter transformation will be used. + p_random_affine: float, the probability of performing a random affine transform using + torchvision.transforms.RandomAffine. + long_mask: bool, if True, returns the mask as LongTensor in label-encoded format. 
+ """ + def __init__(self, crop=(32, 32), p_flip=0.5, color_jitter_params=(0.1, 0.1, 0.1, 0.1), + p_random_affine=0, long_mask=False): + self.crop = crop + self.p_flip = p_flip + self.color_jitter_params = color_jitter_params + if color_jitter_params: + self.color_tf = T.ColorJitter(*color_jitter_params) + self.p_random_affine = p_random_affine + self.long_mask = long_mask + + def __call__(self, image, mask): + # transforming to PIL image + image = image.astype(np.uint8) + mask = mask.astype(np.uint8) + + image, mask = F.to_pil_image(image), F.to_pil_image(mask) + + # random crop + if self.crop: + i, j, h, w = T.RandomCrop.get_params(image, self.crop) + image, mask = F.crop(image, i, j, h, w), F.crop(mask, i, j, h, w) + + if np.random.rand() < self.p_flip: + image, mask = F.hflip(image), F.hflip(mask) + + # color transforms || ONLY ON IMAGE + if self.color_jitter_params: + image = self.color_tf(image) + + # random affine transform + if np.random.rand() < self.p_random_affine: + affine_params = T.RandomAffine(180).get_params((-90, 90), (1, 1), (2, 2), (-45, 45), self.crop) + image, mask = F.affine(image, *affine_params), F.affine(mask, *affine_params) + + # transforming to tensor + image = F.to_tensor(image) + if not self.long_mask: + mask = F.to_tensor(mask) + else: + mask = to_long_tensor(mask) + + return image, mask + + +class ImageToImage2D(Dataset): + """ + Reads the images and applies the augmentation transform on them. + Usage: + 1. If used without the unet.model.Model wrapper, an instance of this object should be passed to + torch.utils.data.DataLoader. Iterating through this returns the tuple of image, mask and image + filename. + 2. With unet.model.Model wrapper, an instance of this object should be passed as train or validation + datasets. + + Args: + dataset_path: path to the dataset. Structure of the dataset should be: + dataset_path + |-- images + |-- img001.png + |-- img002.png + |-- ... + |-- masks + |-- img001.png + |-- img002.png + |-- ... 
+ + joint_transform: augmentation transform, an instance of JointTransform2D. If bool(joint_transform) + evaluates to False, torchvision.transforms.ToTensor will be used on both image and mask. + one_hot_mask: bool, if True, returns the mask in one-hot encoded form. + """ + + def __init__(self, dataset_path: str, joint_transform: Callable = None, one_hot_mask: int = False) -> None: + self.dataset_path = dataset_path + self.input_path = os.path.join(dataset_path, 'img') + self.output_path = os.path.join(dataset_path, 'labelcol') + self.images_list = os.listdir(self.input_path) + self.one_hot_mask = one_hot_mask + + if joint_transform: + self.joint_transform = joint_transform + else: + to_tensor = T.ToTensor() + self.joint_transform = lambda x, y: (to_tensor(x), to_tensor(y)) + + def __len__(self): + return len(os.listdir(self.input_path)) + + def __getitem__(self, idx): + image_filename = self.images_list[idx] + #print(image_filename[: -3]) + # read image + # print(os.path.join(self.input_path, image_filename)) + # print(os.path.join(self.output_path, image_filename[: -3] + "png")) + # print(os.path.join(self.input_path, image_filename)) + image = cv2.imread(os.path.join(self.input_path, image_filename)) + # print(image.shape) + # read mask image + mask = cv2.imread(os.path.join(self.output_path, image_filename[: -3] + "png"),0) + + mask[mask<=127] = 0 + mask[mask>127] = 1 + # correct dimensions if needed + image, mask = correct_dims(image, mask) + # print(image.shape) + + if self.joint_transform: + image, mask = self.joint_transform(image, mask) + + if self.one_hot_mask: + assert self.one_hot_mask > 0, 'one_hot_mask must be nonnegative' + mask = torch.zeros((self.one_hot_mask, mask.shape[1], mask.shape[2])).scatter_(0, mask.long(), 1) + # mask = np.swapaxes(mask,2,0) + # print(image.shape) + # print(mask.shape) + # mask = np.transpose(mask,(2,0,1)) + # image = np.transpose(image,(2,0,1)) + # print(image.shape) + # print(mask.shape) + + return image, mask, 
image_filename + + +class Image2D(Dataset): + """ + Reads the images and applies the augmentation transform on them. As opposed to ImageToImage2D, this + reads a single image and requires a simple augmentation transform. + Usage: + 1. If used without the unet.model.Model wrapper, an instance of this object should be passed to + torch.utils.data.DataLoader. Iterating through this returns the tuple of image and image + filename. + 2. With unet.model.Model wrapper, an instance of this object should be passed as a prediction + dataset. + + Args: + + dataset_path: path to the dataset. Structure of the dataset should be: + dataset_path + |-- images + |-- img001.png + |-- img002.png + |-- ... + + transform: augmentation transform. If bool(joint_transform) evaluates to False, + torchvision.transforms.ToTensor will be used. + """ + + def __init__(self, dataset_path: str, transform: Callable = None): + + self.dataset_path = dataset_path + self.input_path = os.path.join(dataset_path, 'img') + self.images_list = os.listdir(self.input_path) + + if transform: + self.transform = transform + else: + self.transform = T.ToTensor() + + def __len__(self): + return len(os.listdir(self.input_path)) + + def __getitem__(self, idx): + + image_filename = self.images_list[idx] + + image = cv2.imread(os.path.join(self.input_path, image_filename)) + + # image = np.transpose(image,(2,0,1)) + + image = correct_dims(image) + + image = self.transform(image) + + # image = np.swapaxes(image,2,0) + + return image, image_filename + +def chk_mkdir(*paths: Container) -> None: + """ + Creates folders if they do not exist. + + Args: + paths: Container of paths to be created. 
+ """ + for path in paths: + if not os.path.exists(path): + os.makedirs(path) + + +class Logger: + def __init__(self, verbose=False): + self.logs = defaultdict(list) + self.verbose = verbose + + def log(self, logs): + for key, value in logs.items(): + self.logs[key].append(value) + + if self.verbose: + print(logs) + + def get_logs(self): + return self.logs + + def to_csv(self, path): + pd.DataFrame(self.logs).to_csv(path, index=None) + + +class MetricList: + def __init__(self, metrics): + assert isinstance(metrics, dict), '\'metrics\' must be a dictionary of callables' + self.metrics = metrics + self.results = {key: 0.0 for key in self.metrics.keys()} + + def __call__(self, y_out, y_batch): + for key, value in self.metrics.items(): + self.results[key] += value(y_out, y_batch) + + def reset(self): + self.results = {key: 0.0 for key in self.metrics.keys()} + + def get_results(self, normalize=False): + assert isinstance(normalize, bool) or isinstance(normalize, Number), '\'normalize\' must be boolean or a number' + if not normalize: + return self.results + else: + return {key: value/normalize for key, value in self.results.items()} diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/utils_gray.py b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/utils_gray.py new file mode 100644 index 0000000000000000000000000000000000000000..6c9b4253932036bbb6579a559c14ff82179988ee --- /dev/null +++ b/PuzzleTuning/SSL_structures/Medical_Transformer_main/Medical_Transformer_main/utils_gray.py @@ -0,0 +1,283 @@ +import os +import numpy as np +import torch + +from skimage import io,color +from PIL import Image +from torch.utils.data import Dataset +from torchvision import transforms as T +from torchvision.transforms import functional as F + +from typing import Callable +import os +import cv2 +import pandas as pd + +from numbers import Number +from typing import Container +from collections import defaultdict + + +def 
to_long_tensor(pic): + # handle numpy array + img = torch.from_numpy(np.array(pic, np.uint8)) + # backward compatibility + return img.long() + + +def correct_dims(*images): + corr_images = [] + # print(images) + for img in images: + if len(img.shape) == 2: + corr_images.append(np.expand_dims(img, axis=2)) + else: + corr_images.append(img) + + if len(corr_images) == 1: + return corr_images[0] + else: + return corr_images + + +class JointTransform2D: + """ + Performs augmentation on image and mask when called. Due to the randomness of augmentation transforms, + it is not enough to simply apply the same Transform from torchvision on the image and mask separetely. + Doing this will result in messing up the ground truth mask. To circumvent this problem, this class can + be used, which will take care of the problems above. + + Args: + crop: tuple describing the size of the random crop. If bool(crop) evaluates to False, no crop will + be taken. + p_flip: float, the probability of performing a random horizontal flip. + color_jitter_params: tuple describing the parameters of torchvision.transforms.ColorJitter. + If bool(color_jitter_params) evaluates to false, no color jitter transformation will be used. + p_random_affine: float, the probability of performing a random affine transform using + torchvision.transforms.RandomAffine. + long_mask: bool, if True, returns the mask as LongTensor in label-encoded format. 
+ """ + def __init__(self, crop=(32, 32), p_flip=0.5, color_jitter_params=(0.1, 0.1, 0.1, 0.1), + p_random_affine=0, long_mask=False): + self.crop = crop + self.p_flip = p_flip + self.color_jitter_params = color_jitter_params + if color_jitter_params: + self.color_tf = T.ColorJitter(*color_jitter_params) + self.p_random_affine = p_random_affine + self.long_mask = long_mask + + def __call__(self, image, mask): + # transforming to PIL image + image, mask = F.to_pil_image(image), F.to_pil_image(mask) + + # random crop + if self.crop: + i, j, h, w = T.RandomCrop.get_params(image, self.crop) + image, mask = F.crop(image, i, j, h, w), F.crop(mask, i, j, h, w) + + if np.random.rand() < self.p_flip: + image, mask = F.hflip(image), F.hflip(mask) + + # color transforms || ONLY ON IMAGE + if self.color_jitter_params: + image = self.color_tf(image) + + # random affine transform + if np.random.rand() < self.p_random_affine: + affine_params = T.RandomAffine(180).get_params((-90, 90), (1, 1), (2, 2), (-45, 45), self.crop) + image, mask = F.affine(image, *affine_params), F.affine(mask, *affine_params) + + # transforming to tensor + image = F.to_tensor(image) + if not self.long_mask: + mask = F.to_tensor(mask) + else: + mask = to_long_tensor(mask) + + return image, mask + + +class ImageToImage2D(Dataset): + """ + Reads the images and applies the augmentation transform on them. + Usage: + 1. If used without the unet.model.Model wrapper, an instance of this object should be passed to + torch.utils.data.DataLoader. Iterating through this returns the tuple of image, mask and image + filename. + 2. With unet.model.Model wrapper, an instance of this object should be passed as train or validation + datasets. + + Args: + dataset_path: path to the dataset. Structure of the dataset should be: + dataset_path + |-- images + |-- img001.png + |-- img002.png + |-- ... + |-- masks + |-- img001.png + |-- img002.png + |-- ... 
+ + joint_transform: augmentation transform, an instance of JointTransform2D. If bool(joint_transform) + evaluates to False, torchvision.transforms.ToTensor will be used on both image and mask. + one_hot_mask: bool, if True, returns the mask in one-hot encoded form. + """ + + def __init__(self, dataset_path: str, joint_transform: Callable = None, one_hot_mask: int = False) -> None: + self.dataset_path = dataset_path + self.input_path = os.path.join(dataset_path, 'img') + self.output_path = os.path.join(dataset_path, 'labelcol') + self.images_list = os.listdir(self.input_path) + self.one_hot_mask = one_hot_mask + + if joint_transform: + self.joint_transform = joint_transform + else: + to_tensor = T.ToTensor() + self.joint_transform = lambda x, y: (to_tensor(x), to_tensor(y)) + + def __len__(self): + return len(os.listdir(self.input_path)) + + def __getitem__(self, idx): + image_filename = self.images_list[idx] + #print(image_filename[: -3]) + # read image + # print(os.path.join(self.input_path, image_filename)) + # print(os.path.join(self.output_path, image_filename[: -3] + "png")) + # print(os.path.join(self.input_path, image_filename)) + image = cv2.imread(os.path.join(self.input_path, image_filename),0) + # print(image.shape) + # read mask image + mask = cv2.imread(os.path.join(self.output_path, image_filename[: -3] + "png"),0) + + # correct dimensions if needed + image, mask = correct_dims(image, mask) + # print(image.shape) + mask[mask<127] = 0 + mask[mask>=127] = 1 + + + if self.joint_transform: + image, mask = self.joint_transform(image, mask) + + if self.one_hot_mask: + assert self.one_hot_mask > 0, 'one_hot_mask must be nonnegative' + mask = torch.zeros((self.one_hot_mask, mask.shape[1], mask.shape[2])).scatter_(0, mask.long(), 1) + # mask = np.swapaxes(mask,2,0) + # print(image.shape) + # print(mask.shape) + # mask = np.transpose(mask,(2,0,1)) + # image = np.transpose(image,(2,0,1)) + # print(image.shape) + # print(mask.shape) + + return image, mask, 
image_filename + + +class Image2D(Dataset): + """ + Reads the images and applies the augmentation transform on them. As opposed to ImageToImage2D, this + reads a single image and requires a simple augmentation transform. + Usage: + 1. If used without the unet.model.Model wrapper, an instance of this object should be passed to + torch.utils.data.DataLoader. Iterating through this returns the tuple of image and image + filename. + 2. With unet.model.Model wrapper, an instance of this object should be passed as a prediction + dataset. + + Args: + + dataset_path: path to the dataset. Structure of the dataset should be: + dataset_path + |-- images + |-- img001.png + |-- img002.png + |-- ... + + transform: augmentation transform. If bool(joint_transform) evaluates to False, + torchvision.transforms.ToTensor will be used. + """ + + def __init__(self, dataset_path: str, transform: Callable = None): + + self.dataset_path = dataset_path + self.input_path = os.path.join(dataset_path, 'img') + self.images_list = os.listdir(self.input_path) + + if transform: + self.transform = transform + else: + self.transform = T.ToTensor() + + def __len__(self): + return len(os.listdir(self.input_path)) + + def __getitem__(self, idx): + + image_filename = self.images_list[idx] + + image = cv2.imread(os.path.join(self.input_path, image_filename),0) + + # image = np.transpose(image,(2,0,1)) + + image = correct_dims(image) + + image = self.transform(image) + + # image = np.swapaxes(image,2,0) + + return image, image_filename + +def chk_mkdir(*paths: Container) -> None: + """ + Creates folders if they do not exist. + + Args: + paths: Container of paths to be created. 
+ """ + for path in paths: + if not os.path.exists(path): + os.makedirs(path) + + +class Logger: + def __init__(self, verbose=False): + self.logs = defaultdict(list) + self.verbose = verbose + + def log(self, logs): + for key, value in logs.items(): + self.logs[key].append(value) + + if self.verbose: + print(logs) + + def get_logs(self): + return self.logs + + def to_csv(self, path): + pd.DataFrame(self.logs).to_csv(path, index=None) + + +class MetricList: + def __init__(self, metrics): + assert isinstance(metrics, dict), '\'metrics\' must be a dictionary of callables' + self.metrics = metrics + self.results = {key: 0.0 for key in self.metrics.keys()} + + def __call__(self, y_out, y_batch): + for key, value in self.metrics.items(): + self.results[key] += value(y_out, y_batch) + + def reset(self): + self.results = {key: 0.0 for key in self.metrics.keys()} + + def get_results(self, normalize=False): + assert isinstance(normalize, bool) or isinstance(normalize, Number), '\'normalize\' must be boolean or a number' + if not normalize: + return self.results + else: + return {key: value/normalize for key, value in self.results.items()} diff --git a/PuzzleTuning/SSL_structures/Medical_Transformer_main/__init__.py b/PuzzleTuning/SSL_structures/Medical_Transformer_main/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/PuzzleTuning/SSL_structures/SAE.py b/PuzzleTuning/SSL_structures/SAE.py new file mode 100644 index 0000000000000000000000000000000000000000..274427cc8417070fd1c8468eae526b810ac08c15 --- /dev/null +++ b/PuzzleTuning/SSL_structures/SAE.py @@ -0,0 +1,798 @@ +""" +SAE Model Script ver: Oct 28th 2023 15:30 +SAE stands for shuffled autoencoder, designed for PuzzleTuning + +# References: +Based on MAE code. 
+https://github.com/facebookresearch/mae + +""" + +from functools import partial + +import torch +import torch.nn as nn + +from timm.models.vision_transformer import PatchEmbed, Block + +from SSL_structures.pos_embed import get_2d_sincos_pos_embed + +from Backbone.VPT_structure import VPT_ViT + + +class ShuffledAutoEncoderViT(VPT_ViT): + """ + Shuffled Autoencoder with VisionTransformer backbone + + prompt_mode: "Deep" / "Shallow" by default None + """ + + def __init__(self, img_size=224, patch_size=16, in_chans=3, + embed_dim=1024, depth=24, num_heads=16, + decoder_embed_dim=512, decoder_depth=8, decoder_num_heads=16, + mlp_ratio=4., norm_layer=nn.LayerNorm, norm_pix_loss=False, group_shuffle_size=-1, + prompt_mode=None, Prompt_Token_num=20, basic_state_dict=None, decoder=None, decoder_rep_dim=None): + + if prompt_mode is None: + super().__init__() + # SAE encoder specifics (this part just the same as ViT) + # -------------------------------------------------------------------------- + self.patch_embed = PatchEmbed(img_size, patch_size, in_chans, embed_dim) # BCHW -> BNC + num_patches = self.patch_embed.num_patches + + # learnable cls token is still used but on cls head need + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + # set and freeze encoder_pos_embed, use the fixed sin-cos embedding for tokens + mask_token + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim), requires_grad=False) + # Encoder blocks + self.blocks = nn.ModuleList([ # qk_scale=None fixme related to timm version + Block(embed_dim, num_heads, mlp_ratio, qkv_bias=True, norm_layer=norm_layer) + for i in range(depth)]) + self.norm = norm_layer(embed_dim) + + self.prompt_mode = prompt_mode + # -------------------------------------------------------------------------- + + else: + super().__init__(img_size=img_size, patch_size=patch_size, in_chans=in_chans, + embed_dim=embed_dim, depth=depth, num_heads=num_heads, mlp_ratio=mlp_ratio, + norm_layer=norm_layer, 
Prompt_Token_num=Prompt_Token_num, VPT_type=prompt_mode, + basic_state_dict=None) # Firstly, set then Encoder state_dict to none here. + num_patches = self.patch_embed.num_patches # set patch_embed of VPT + # set and freeze encoder_pos_embed, use the fixed sin-cos embedding for tokens + mask_token + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim), requires_grad=False) + + self.prompt_mode = prompt_mode + # Freeze Encoder parameters except of the Prompt Tokens + self.Freeze() + + # SAE decoder specifics todo as a low-level backbone, the explore for future segmentation is need + # -------------------------------------------------------------------------- + # if the feature dimension of encoder and decoder are different, use decoder_embed to align them + if embed_dim != decoder_embed_dim: + self.decoder_embed = nn.Linear(embed_dim, decoder_embed_dim, bias=True) + else: + self.decoder_embed = nn.Identity() + + # set decoder + if decoder is not None: + self.decoder = decoder + # Decoder use a FC to reconstruct image, unlike the Encoder which use a CNN to split patch + self.decoder_pred = nn.Linear(decoder_rep_dim, patch_size ** 2 * in_chans, bias=True) # decoder to patch + + else: + self.decoder = None + # set and freeze decoder_pos_embed, use the fixed sin-cos embedding for tokens + mask_token + self.decoder_pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, decoder_embed_dim), + requires_grad=False) + self.decoder_blocks = nn.ModuleList([ # qk_scale=None fixme related to timm version + Block(decoder_embed_dim, decoder_num_heads, mlp_ratio, qkv_bias=True, norm_layer=norm_layer) + for i in range(decoder_depth)]) + self.decoder_norm = norm_layer(decoder_embed_dim) + + # Decoder use a FC to reconstruct image, unlike the Encoder which use a CNN to split patch + self.decoder_pred = nn.Linear(decoder_embed_dim, patch_size ** 2 * in_chans, bias=True) # decoder to patch + + # 
-------------------------------------------------------------------------- + # this controls the puzzle group + self.group_shuffle_size = group_shuffle_size + + # wether or not to use norm_pix_loss + self.norm_pix_loss = norm_pix_loss + # parameter initialization + self.initialize_weights() + + # load basic state_dict of backbone for Transfer-learning-based tuning + if basic_state_dict is not None: + self.load_state_dict(basic_state_dict, False) + + def initialize_weights(self): + # initialization + # initialize a 2d positional encoding of (embed_dim, grid) by sin-cos embedding + pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], + int(self.patch_embed.num_patches ** .5), + cls_token=True) + self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0)) + + if self.decoder is None: + # initialize a 2d positional encoding of (embed_dim, grid) by sin-cos embedding + decoder_pos_embed = get_2d_sincos_pos_embed(self.decoder_pos_embed.shape[-1], + int(self.patch_embed.num_patches ** .5), + cls_token=True) + self.decoder_pos_embed.data.copy_(torch.from_numpy(decoder_pos_embed).float().unsqueeze(0)) + + # initialize patch_embed like nn.Linear (instead of nn.Conv2d) + w = self.patch_embed.proj.weight.data + torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1])) # xavier_uniform,让输入输出的方差相同,包括前后向传播 + + # timm's trunc_normal_(std=.02) is effectively normal_(std=0.02) as cutoff is too big (2.) 
+ torch.nn.init.normal_(self.cls_token, std=.02) + # torch.nn.init.normal_(self.prompt_token, std=.02) + + # initialize nn.Linear and nn.LayerNorm + self.apply(self._init_weights) + + def _init_weights(self, m): + # initialize nn.Linear and nn.LayerNorm + if isinstance(m, nn.Linear): + # we use xavier_uniform following official JAX ViT: + torch.nn.init.xavier_uniform_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def patchify(self, imgs, patch_size=None): + """ + Break image to patch tokens + + input: + imgs: (B, 3, H, W) + + output: + x: (B, num_patches, patch_size**2 *3) AKA [B, num_patches, flatten_dim] + """ + # patch_size + patch_size = self.patch_embed.patch_size[0] if patch_size is None else patch_size + + # assert H == W and image shape is dividedable by patch + assert imgs.shape[2] == imgs.shape[3] and imgs.shape[2] % patch_size == 0 + # patch num in rol or column + h = w = imgs.shape[2] // patch_size + + # use reshape to split patch [B, C, H, W] -> [B, C, h_p, patch_size, w_p, patch_size] + x = imgs.reshape(shape=(imgs.shape[0], 3, h, patch_size, w, patch_size)) + + # ReArrange dimensions [B, C, h_p, patch_size, w_p, patch_size] -> [B, h_p, w_p, patch_size, patch_size, C] + x = torch.einsum('nchpwq->nhwpqc', x) + # ReArrange dimensions [B, h_p, w_p, patch_size, patch_size, C] -> [B, num_patches, flatten_dim] + x = x.reshape(shape=(imgs.shape[0], h * w, patch_size ** 2 * 3)) + return x + + def patchify_decoder(self, imgs, patch_size=None): + """ + Break image to patch tokens + + fixme,注意,这里patch_size应该是按照decoder的网络设置来作为default更合理 + + input: + imgs: (B, CLS, H, W) + + output: + x: (B, num_patches, -1) AKA [B, num_patches, -1] + """ + # patch_size + patch_size = self.patch_embed.patch_size[0] if patch_size is None else patch_size + + # assert H == W and image shape is divided-able by patch + assert 
imgs.shape[2] == imgs.shape[3] and imgs.shape[2] % patch_size == 0 + # patch num in rol or column + h = w = imgs.shape[2] // patch_size + + # use reshape to split patch [B, CLS, H, W] -> [B, CLS, h_p, patch_size, w_p, patch_size] + x = imgs.reshape(shape=(imgs.shape[0], -1, h, patch_size, w, patch_size)) + + # ReArrange dimensions [B, CLS, h_p, patch_size, w_p, patch_size] -> [B, h_p, w_p, patch_size, patch_size, CLS] + x = torch.einsum('nchpwq->nhwpqc', x) + # ReArrange dimensions [B, h_p, w_p, patch_size, patch_size, C] -> [B, num_patches, flatten_dim] + x = x.reshape(shape=(imgs.shape[0], h * w, -1)) + return x + + def unpatchify(self, x, patch_size=None): + """ + Decoding encoded patch tokens + + input: + x: (B, num_patches, patch_size**2 *3) AKA [B, num_patches, flatten_dim] + + output: + imgs: (B, 3, H, W) + """ + # patch_size + p = self.patch_embed.patch_size[0] if patch_size is None else patch_size + + # squre root of num_patches (without CLS token is required) + h = w = int(x.shape[1] ** .5) + # assert num_patches is with out CLS token + assert h * w == x.shape[1] + + # ReArrange dimensions [B, num_patches, flatten_dim] -> [B, h_p, w_p, patch_size, patch_size, C] + x = x.reshape(shape=(x.shape[0], h, w, p, p, 3)) + # ReArrange dimensions [B, h_p, w_p, patch_size, patch_size, C] -> [B, C, h_p, patch_size, w_p, patch_size] + x = torch.einsum('nhwpqc->nchpwq', x) + # use reshape to compose patch [B, C, h_p, patch_size, w_p, patch_size] -> [B, C, H, W] + imgs = x.reshape(shape=(x.shape[0], 3, h * p, h * p)) + return imgs + + def fix_position_shuffling(self, x, fix_position_ratio, puzzle_patch_size): + """ + Fix-position shuffling + + Randomly assign patches by per-sample shuffling. + After it, the fixed patches are reserved as Positional Tokens + the rest patches are batch wise randomly shuffled among the batch since they serve as Relation Tokens. + + Per-sample shuffling is done by argsort random noise. 
+ batch wise shuffle operation is done by shuffle all idxes + + input: + x: [B, 3, H, W], input image tensor + fix_position_ratio float + puzzle_patch_size int + + output: x_puzzled, mask + x_puzzled: [B, 3, H, W] + mask: [B, 3, H, W], binary mask indicating pix position with 0 + """ + # Break img into puzzle patches with the size of puzzle_patch_size [B, num_puzzle_patches, D_puzzle] + x = self.patchify(x, puzzle_patch_size) + # output: x: (B, num_patches, patch_size**2 *3) AKA [B, num_patches, flatten_dim] + B, num_puzzle_patches, D = x.shape + + # num of fix_position puzzle patches + len_fix_position = int(num_puzzle_patches * fix_position_ratio) + num_shuffled_patches = num_puzzle_patches - len_fix_position + # create a noise tensor to prepare shuffle idx of puzzle patches + noise = torch.rand(B, num_puzzle_patches, device=x.device) # [B,num_puzzle_patches] noise in [0, 1] + + # 在Batch里面每个序列上获得noise tensor经过升序排列后原本位置的idx矩阵,(各自不同) + ids_shuffle = torch.argsort(noise, dim=1) # ascend: small is keep, large is remove + # 再对idx矩阵继续升序排列可获得:原始noise tensor的每个位置的排序顺位 + ids_restore = torch.argsort(ids_shuffle, dim=1) + + # keep the first subset 前面的是fix的,后面的是puzzle的 + ids_fix = ids_shuffle[:, :len_fix_position] # [B,num_puzzle_patches] -> [B,fix_patches] + # fix_patches=num_puzzle_patches * fix_position_ratio len_fix_position + ids_puzzle = ids_shuffle[:, len_fix_position:] # [B,num_puzzle_patches] -> [B,puzzle_patches] + # puzzle_patches=num_puzzle_patches*(1-fix_position_ratio) num_shuffled_patches + + # set puzzle patch + # ids_?.unsqueeze(-1).repeat(1, 1, D) + # [B,?_patches] -> [B,?_patches,1] (at each place with the idx of ori patch) -> [B,?_patches,D] + + # torch.gather to select patche groups x_fixed of [B,fix_patches,D] and x_puzzle of [B,puzzle_patches,D] + # 要保持的,batch中每个sample不一样 + x_fixed = torch.gather(x, dim=1, index=ids_fix.unsqueeze(-1).repeat(1, 1, D)) + # 要shuffle的,batch中每个sample不一样 + x_puzzle = torch.gather(x, dim=1, 
index=ids_puzzle.unsqueeze(-1).repeat(1, 1, D)) + + # batch&patch-wise shuffle is needed else the restore will restore all puzzles + if self.group_shuffle_size == -1 or self.group_shuffle_size == B: + puzzle_shuffle_indices = torch.randperm(B * num_shuffled_patches, device=x.device, requires_grad=False) + else: + assert B > self.group_shuffle_size > 0 and B % self.group_shuffle_size == 0 + # build [B//self.group_shuffle_size, num_puzzle_patches] noise in [0, 1] + group_noise = torch.rand(B // self.group_shuffle_size, num_shuffled_patches * self.group_shuffle_size, device=x.device) + # get shuffled index in each (num_shuffled_patches*group_shuffle) + group_ids_shuffle = torch.argsort(group_noise, dim=1) + # break the dim and add the group idx(in list), stack back to tensor + group_ids_shuffle = torch.stack([group_ids_shuffle[i] + + num_shuffled_patches * self.group_shuffle_size * i + for i in range(B // self.group_shuffle_size)]) + # flattern to be idx for all (B * num_shuffled_patches) + puzzle_shuffle_indices = group_ids_shuffle.view(-1) + + # 将0~B * num_shuffled_patches-1(包括0和B * num_shuffled_patches-1)随机打乱后获得的数字序列 + x_puzzle = x_puzzle.view(B * num_shuffled_patches, D)[puzzle_shuffle_indices].view(B, num_shuffled_patches, D) + # 利用randperm获得的乱序序列对应batch内所有需要shuffle的部分进行打乱顺序,之后将其恢复为原本的划分batch + # pack up all puzzle patches + x = torch.cat([x_fixed, x_puzzle], dim=1) + + # generate the binary mask: 0 is keep, 1 is remove + mask = torch.ones([B, num_puzzle_patches, D], device=x.device, requires_grad=False) # no grad + mask[:, :len_fix_position, :] = 0 # set the first len_fix of tokens to 0,rest to 1 + + # unshuffle to restore the fixed positions + x = torch.gather(x, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, D)) + # torch.gather to generate restored binary mask + mask = torch.gather(mask, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, D)) + + # unpatchify to obtain puzzle images and their mask + x = self.unpatchify(x, puzzle_patch_size) + mask = 
def forward_encoder(self, imgs):
    """
    Encode a batch of (puzzled) images with the ViT / VPT backbone.

    :param imgs: [B, C, H, W], batch of images

    :return: x, CLS_token, embed_puzzle
        x: [B, num_patches, D], encoded patch tokens (the cls token is split off)
        CLS_token: [B, 1, D], encoded cls token
        embed_puzzle: [B, num_patches, D], detached copy of the patch embeddings
            (positional embedding already added) taken before the Transformer
            blocks; used by the caller as position hints / for illustration
    """
    # The embedding stage is identical for the plain ViT and for the VPT variants,
    # so it is done once here instead of being duplicated in every branch.

    # embed patches and add pos embed before concatenating the cls token
    # (index 0 of pos_embed is reserved for the cls token)
    x = self.patch_embed(imgs)
    x = x + self.pos_embed[:, 1:, :]

    # keep a gradient-free copy of the embedded puzzle; detach() replaces the
    # legacy `.data` access and yields the same values
    embed_puzzle = x.detach()

    # prepend the cls token (with its own positional embedding), expanded over the batch
    cls_token = self.cls_token + self.pos_embed[:, :1, :]
    x = torch.cat((cls_token.expand(x.shape[0], -1, -1), x), dim=1)

    if self.prompt_mode is None:  # ViT
        # apply Transformer blocks
        for blk in self.blocks:
            x = blk(x)

    elif self.VPT_type == "Deep":  # VPT-Deep: fresh prompt tokens for every block
        prompt_token_num = self.Prompt_Tokens.shape[1]

        for i, blk in enumerate(self.blocks):
            # firstly concatenate the prompt tokens of this layer ...
            prompts = self.Prompt_Tokens[i].unsqueeze(0).expand(x.shape[0], -1, -1)
            x = torch.cat((x, prompts), dim=1)
            num_tokens = x.shape[1]
            # ... lastly remove them again, so the next layer gets its own prompts
            x = blk(x)[:, :num_tokens - prompt_token_num]

    else:  # VPT-Shallow: prompt tokens are appended once, before the first block
        prompt_token_num = self.Prompt_Tokens.shape[1]

        prompts = self.Prompt_Tokens.expand(x.shape[0], -1, -1)
        x = torch.cat((x, prompts), dim=1)
        num_tokens = x.shape[1]
        # the whole stack is called at once
        # NOTE(review): this requires self.blocks to be callable (e.g. nn.Sequential) - confirm
        x = self.blocks(x)[:, :num_tokens - prompt_token_num]

    # last norm of Transformer
    x = self.norm(x)

    CLS_token = x[:, :1, :]
    x = x[:, 1:, :]

    # Encoder output: encoded patch tokens, cls token, embedded original puzzle
    return x, CLS_token, embed_puzzle
def forward_loss(self, imgs, pred, mask):
    """
    MSE loss between the reconstruction and the original image, averaged over
    the puzzled (mask == 1) patches only.

    Input:
    imgs: [B, 3, H, W], Encoder input image
    pred: [B, num_patches, p*p*3], Decoder reconstructed image
    mask: [B, num_patches, D], 0 is keep, 1 is puzzled; only channel 0 of the
        last dim is read, so any D >= 1 works

    Output:
    scalar tensor. When no patch is marked as puzzled the result is 0 instead
    of the NaN that a 0/0 division would give.
    """
    # target imgs: [B, 3, H, W] -> [B, num_patches, p*p*3]
    target = self.patchify(imgs)
    # use mask as a patch indicator [B, num_patches, D] -> [B, num_patches]
    patch_mask = mask[:, :, 0]  # 1 for puzzled patches, 0 for fixed patches

    if self.norm_pix_loss:  # Normalize the target image patches
        mean = target.mean(dim=-1, keepdim=True)
        var = target.var(dim=-1, keepdim=True)
        target = (target - mean) / (var + 1.e-6) ** .5

    # MSE loss, mean over the pixels of each patch -> [B, num_patches]
    loss = ((pred - target) ** 2).mean(dim=-1)

    # Only an exactly-zero denominator is replaced, so every case that was
    # finite before keeps its value; the all-fixed case yields 0 instead of NaN.
    denom = patch_mask.sum()
    denom = torch.where(denom > 0, denom, torch.ones_like(denom))

    return (loss * patch_mask).sum() / denom  # mean loss on puzzled patches, scalar
output size of more tokens ? currently use firstly-cat-lastly-remove so its fine + + # STEP 3: Puzzle restoring + + # step 3.(a) prepare decoder input indcator mask at the encoder output stage: + mask_patches_pp3 = self.patchify(mask) # mark relation tokens with 1 [B, num_patches, p*p*3] + # here, latent crop size is automatically based on encoder embedding + + # Reassign mask indicator shape to the encoder output dim + if mask_patches_pp3.shape[-1] != latent_puzzle.shape[-1]: + # [B, num_patches, p*p*3] -> [B, num_patches, 1] -> [B, num_patches, D] + mask_patches = mask_patches_pp3[:, :, :1].expand(-1, -1, latent_puzzle.shape[-1]) + else: + mask_patches = mask_patches_pp3 + + # anti_mask: [B, num_patches, D], binary mask indicating fix position with 1 instead of 0 + anti_mask = mask_patches * -1 + 1 # great trick to process positional operation with less calculation + + # Position hint + # in mask, 0 is Position Tokens, therefore take only Relation Tokens + latent_tokens = latent_puzzle * mask_patches # take out relation tokens(latent_tokens here) + # in anti_mask, 0 is Relation Tokens, therefore take only Position Tokens + hint_tokens = embed_puzzle * anti_mask # anti_mask to take hint_tokens (position tokens) + # group decoder tokens: [B, num_patches, D] + latent = latent_tokens + hint_tokens + # append back the cls token at the first -> [B, 1+num_patches, D] + x = torch.cat([CLS_token, latent], dim=1) + + # step 3.(b) Decoder to obtain Reconstructed image patches: + # [B, 1+num_patches,D] -> [B, 1+num_patches, D_Decoder] -> [B, num_patches, p*p*3] + pred = self.forward_decoder(x) + + # combined pred + anti_mask_patches_pp3 = mask_patches_pp3 * -1 + 1 # fix position with 1, relation patches with 0 + hint_img_patches = imgs_puzzled_patches * anti_mask_patches_pp3 + pred_img_patches = pred * mask_patches_pp3 # mark relation tokens with 1, fix position with 0 + pred_with_hint_imgs = hint_img_patches + pred_img_patches + + # MSE loss for all patches towards the ori 
image + loss = self.forward_loss(imgs, pred, mask_patches) + # print(loss) # check whether the loss is working + + if combined_pred_illustration: + return loss, pred_with_hint_imgs, imgs_puzzled_patches + else: + return loss, pred, imgs_puzzled_patches + + +def sae_vit_base_patch16_dec512d8b(dec_idx=None, **kwargs): + print("Decoder:", dec_idx) + + model = ShuffledAutoEncoderViT( + patch_size=16, embed_dim=768, depth=12, num_heads=12, + decoder_embed_dim=512, decoder_depth=8, decoder_num_heads=16, + mlp_ratio=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + return model + + +def sae_vit_large_patch16_dec512d8b(dec_idx=None, **kwargs): + print("Decoder:", dec_idx) + + model = ShuffledAutoEncoderViT( + patch_size=16, embed_dim=1024, depth=24, num_heads=16, + decoder_embed_dim=512, decoder_depth=8, decoder_num_heads=16, + mlp_ratio=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + return model + + +def sae_vit_huge_patch14_dec512d8b(dec_idx=None, **kwargs): + print("Decoder:", dec_idx) + + model = ShuffledAutoEncoderViT( + patch_size=14, embed_dim=1280, depth=32, num_heads=16, + decoder_embed_dim=512, decoder_depth=8, decoder_num_heads=16, + mlp_ratio=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + return model + + +# decoder +def sae_vit_base_patch16_dec(dec_idx=None, num_classes=3, img_size=224, **kwargs): + # num_classes做的是one-hot seg但是不是做还原,我们得设计一下如何去做这个还原才能实现预训练 + + if dec_idx == 'swin_unet': + decoder_embed_dim = 768 + decoder_rep_dim = 16 * 16 * 3 + + from SSL_structures.Swin_Unet_main.networks.vision_transformer import SwinUnet as ViT_seg + decoder = ViT_seg(num_classes=num_classes, img_size=img_size, patch_size=16) + + elif dec_idx == 'transunet': + decoder_embed_dim = 768 + decoder_rep_dim = 16 * 16 * 3 + + transunet_name = 'R50-ViT-B_16' + transunet_patches_size = 16 + from SSL_structures.TransUNet_main.networks.vit_seg_modeling import CONFIGS as CONFIGS_Transunet_seg + from 
def sae_vit_large_patch16_dec(dec_idx=None, num_classes=3, img_size=224, **kwargs):
    """
    Build a ViT-Large/16 ShuffledAutoEncoderViT equipped with a segmentation-style decoder.

    :param dec_idx: decoder selector, one of 'swin_unet', 'transunet', 'UTNetV2'
    :param num_classes: number of output channels requested from the decoder
    :param img_size: input edge size, forwarded to the decoders that need it
    :param kwargs: forwarded unchanged to ShuffledAutoEncoderViT

    :return: the model, or -1 (after printing a message) when dec_idx is not supported
    """
    # num_classes drives a one-hot segmentation head rather than a reconstruction;
    # how to turn that into a reconstruction for pre-training is still to be designed

    if dec_idx not in ('swin_unet', 'transunet', 'UTNetV2'):
        print('no effective decoder!')
        return -1

    # every supported decoder uses the same dimensions
    decoder_embed_dim = 768
    decoder_rep_dim = 16 * 16 * 3

    # decoder packages are imported lazily so only the selected one has to be installed
    if dec_idx == 'swin_unet':
        from SSL_structures.Swin_Unet_main.networks.vision_transformer import SwinUnet as ViT_seg
        decoder = ViT_seg(num_classes=num_classes, img_size=img_size, patch_size=16)

    elif dec_idx == 'transunet':
        transunet_name = 'R50-ViT-B_16'
        transunet_patches_size = 16
        from SSL_structures.TransUNet_main.networks.vit_seg_modeling import CONFIGS as CONFIGS_Transunet_seg
        from SSL_structures.TransUNet_main.networks.vit_seg_modeling import VisionTransformer as Transunet_seg

        config_vit = CONFIGS_Transunet_seg[transunet_name]
        config_vit.n_classes = num_classes
        config_vit.n_skip = 3

        if 'R50' in transunet_name:
            grid_edge = int(img_size / transunet_patches_size)
            config_vit.patches.grid = (grid_edge, grid_edge)
        decoder = Transunet_seg(config_vit, num_classes=config_vit.n_classes)

    else:  # dec_idx == 'UTNetV2'
        from SSL_structures.UtnetV2.utnetv2 import UTNetV2 as UTNetV2_seg
        decoder = UTNetV2_seg(in_chan=3, num_classes=num_classes)

    print('dec_idx: ', dec_idx)

    return ShuffledAutoEncoderViT(
        patch_size=16, embed_dim=1024, depth=24, num_heads=16,
        decoder_embed_dim=decoder_embed_dim, decoder_depth=8, decoder_num_heads=16,
        mlp_ratio=4, norm_layer=partial(nn.LayerNorm, eps=1e-6),
        decoder_rep_dim=decoder_rep_dim, decoder=decoder,
        **kwargs)
# set recommended archs following MAE
sae_vit_base_patch16 = sae_vit_base_patch16_dec512d8b  # decoder: 512 dim, 8 blocks
sae_vit_large_patch16 = sae_vit_large_patch16_dec512d8b  # decoder: 512 dim, 8 blocks
sae_vit_huge_patch14 = sae_vit_huge_patch14_dec512d8b  # decoder: 512 dim, 8 blocks

# Equipped with decoders
sae_vit_base_patch16_decoder = sae_vit_base_patch16_dec  # decoder: 768 dim, HYF decoders
sae_vit_large_patch16_decoder = sae_vit_large_patch16_dec  # decoder: 768 dim, HYF decoders
sae_vit_huge_patch14_decoder = sae_vit_huge_patch14_dec  # decoder: 768 dim, HYF decoders

if __name__ == '__main__':
    # Smoke test: puzzle a stored image batch, reconstruct it and dump both as JPEG files.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    img_size = 224

    '''
    num_classes = 3  # set to 3 for 3 channel
    x = torch.rand(2, 3, img_size, img_size, device=device)
    '''

    image_tensor_path = './temp-tensors/color.pt'
    x = torch.load(image_tensor_path)
    # Tensor.to() returns a new tensor instead of moving in place; the result
    # has to be kept, otherwise the input stays on the CPU while the model
    # below is moved to `device`.
    x = x.to(device)

    # model = sae_vit_base_patch16(img_size=img_size, decoder=None)
    # model = sae_vit_huge_patch14(img_size=img_size, decoder=None)
    # model = sae_vit_base_patch16_decoder(prompt_mode="Deep", dec_idx='swin_unet', img_size=img_size)
    model = sae_vit_base_patch16(img_size=img_size, decoder=None, group_shuffle_size=2)

    '''
    # ViT_Prompt

    from pprint import pprint
    model_names = timm.list_models('*vit*')
    pprint(model_names)

    basic_model = timm.create_model('vit_base_patch' + str(16) + '_' + str(edge_size), pretrained=True)

    basic_state_dict = basic_model.state_dict()

    model = sae_vit_base_patch16(img_size=384, prompt_mode='Deep', Prompt_Token_num=20, basic_state_dict=basic_state_dict)

    prompt_state_dict = model.obtain_prompt()
    VPT = VPT_ViT(img_size=384, VPT_type='Deep', Prompt_Token_num=20, basic_state_dict=basic_state_dict)
    VPT.load_prompt(prompt_state_dict)
    VPT.to(device)
    pred = VPT(x)
    print(pred)
    '''

    model.to(device)

    loss, pred, imgs_puzzled_patches = model(x, fix_position_ratio=0.25, puzzle_patch_size=32,
                                             combined_pred_illustration=True)
    # combined_pred_illustration = True to add hint tokens at the pred, False to know more info

    # visualize the puzzled inputs and the reconstructions to check the effect
    from utils.visual_usage import *

    output_dir = './temp-figs/'
    os.makedirs(output_dir, exist_ok=True)  # saving fails if the folder is missing

    # unpatchify / device transfer do not depend on the image index: do them once
    imgs_puzzled_batch = unpatchify(imgs_puzzled_patches, patch_size=16).cpu()
    recons_img_batch = unpatchify(pred, patch_size=16).cpu()

    for img_idx in range(len(imgs_puzzled_batch)):
        puzzled_img = ToPILImage()(imgs_puzzled_batch[img_idx])
        puzzled_img.save(os.path.join(output_dir, 'puzzled_sample_' + str(img_idx) + '.jpg'))

        recons_img = ToPILImage()(recons_img_batch[img_idx])
        recons_img.save(os.path.join(output_dir, 'recons_sample_' + str(img_idx) + '.jpg'))

    '''

    print(loss, '\n')

    print(loss.shape, '\n')

    print(pred.shape, '\n')

    print(imgs_puzzled_patches.shape, '\n')
    '''
Download pre-trained swin transformer model (Swin-T) +* [Get pre-trained model in this link](https://drive.google.com/drive/folders/1UC3XOoezeum0uck4KBVGa8osahs6rKUY?usp=sharing): Put pretrained Swin-T into folder "pretrained_ckpt/" + +## 2. Prepare data + +- The datasets we used are provided by TransUnet's authors. Please go to ["./datasets/README.md"](datasets/README.md) for details, or please send an Email to jienengchen01 AT gmail.com to request the preprocessed data. If you would like to use the preprocessed data, please use it for research purposes and do not redistribute it (following the TransUnet's License). + +## 3. Environment + +- Please prepare an environment with python=3.7, and then use the command "pip install -r requirements.txt" for the dependencies. + +## 4. Train/Test + +- Run the train script on the Synapse dataset. The batch size we used is 24. If you do not have enough GPU memory, the batch size can be reduced to 12 or 6 to save memory. + +- Train + +```bash +sh train.sh or python train.py --dataset Synapse --cfg configs/swin_tiny_patch4_window7_224_lite.yaml --root_path your DATA_DIR --max_epochs 150 --output_dir your OUT_DIR --img_size 224 --base_lr 0.05 --batch_size 24 +``` + +- Test + +```bash +sh test.sh or python test.py --dataset Synapse --cfg configs/swin_tiny_patch4_window7_224_lite.yaml --is_saveni --volume_path your DATA_DIR --output_dir your OUT_DIR --max_epoch 150 --base_lr 0.05 --img_size 224 --batch_size 24 +``` + +## References +* [TransUnet](https://github.com/Beckschen/TransUNet) +* [SwinTransformer](https://github.com/microsoft/Swin-Transformer) + +## Citation + +```bibtex +@misc{cao2021swinunet, + title={Swin-Unet: Unet-like Pure Transformer for Medical Image Segmentation}, + author={Hu Cao and Yueyue Wang and Joy Chen and Dongsheng Jiang and Xiaopeng Zhang and Qi Tian and Manning Wang}, + year={2021}, + eprint={2105.05537}, + archivePrefix={arXiv}, + primaryClass={eess.IV} +} +``` diff --git
a/PuzzleTuning/SSL_structures/Swin_Unet_main/config.py b/PuzzleTuning/SSL_structures/Swin_Unet_main/config.py new file mode 100644 index 0000000000000000000000000000000000000000..35bf199689555f3fc61fbee04daa2331f7efc181 --- /dev/null +++ b/PuzzleTuning/SSL_structures/Swin_Unet_main/config.py @@ -0,0 +1,229 @@ +# -------------------------------------------------------- +# Swin Transformer +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ze Liu +# --------------------------------------------------------' + +import os +import yaml +from yacs.config import CfgNode as CN + +_C = CN() + +# Base config files +_C.BASE = [''] + +# ----------------------------------------------------------------------------- +# Data settings +# ----------------------------------------------------------------------------- +_C.DATA = CN() +# Batch size for a single GPU, could be overwritten by command line argument +_C.DATA.BATCH_SIZE = 128 +# Path to dataset, could be overwritten by command line argument +_C.DATA.DATA_PATH = '' +# Dataset name +_C.DATA.DATASET = 'imagenet' +# Input image size +_C.DATA.IMG_SIZE = 224 +# Interpolation to resize image (random, bilinear, bicubic) +_C.DATA.INTERPOLATION = 'bicubic' +# Use zipped dataset instead of folder dataset +# could be overwritten by command line argument +_C.DATA.ZIP_MODE = False +# Cache Data in Memory, could be overwritten by command line argument +_C.DATA.CACHE_MODE = 'part' +# Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU. 
+_C.DATA.PIN_MEMORY = True +# Number of data loading threads +_C.DATA.NUM_WORKERS = 8 + +# ----------------------------------------------------------------------------- +# Model settings +# ----------------------------------------------------------------------------- +_C.MODEL = CN() +# Model type +_C.MODEL.TYPE = 'swin' +# Model name +_C.MODEL.NAME = 'swin_tiny_patch4_window7_224' +# Checkpoint to resume, could be overwritten by command line argument +_C.MODEL.PRETRAIN_CKPT = './pretrained_ckpt/swin_tiny_patch4_window7_224.pth' +_C.MODEL.RESUME = '' +# Number of classes, overwritten in data preparation +_C.MODEL.NUM_CLASSES = 1000 +# Dropout rate +_C.MODEL.DROP_RATE = 0.0 +# Drop path rate +_C.MODEL.DROP_PATH_RATE = 0.1 +# Label Smoothing +_C.MODEL.LABEL_SMOOTHING = 0.1 + +# Swin Transformer parameters +_C.MODEL.SWIN = CN() +_C.MODEL.SWIN.PATCH_SIZE = 4 +_C.MODEL.SWIN.IN_CHANS = 3 +_C.MODEL.SWIN.EMBED_DIM = 96 +_C.MODEL.SWIN.DEPTHS = [2, 2, 6, 2] +_C.MODEL.SWIN.DECODER_DEPTHS = [2, 2, 6, 2] +_C.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24] +_C.MODEL.SWIN.WINDOW_SIZE = 7 +_C.MODEL.SWIN.MLP_RATIO = 4. 
+_C.MODEL.SWIN.QKV_BIAS = True +_C.MODEL.SWIN.QK_SCALE = None +_C.MODEL.SWIN.APE = False +_C.MODEL.SWIN.PATCH_NORM = True +_C.MODEL.SWIN.FINAL_UPSAMPLE= "expand_first" + +# ----------------------------------------------------------------------------- +# Training settings +# ----------------------------------------------------------------------------- +_C.TRAIN = CN() +_C.TRAIN.START_EPOCH = 0 +_C.TRAIN.EPOCHS = 300 +_C.TRAIN.WARMUP_EPOCHS = 20 +_C.TRAIN.WEIGHT_DECAY = 0.05 +_C.TRAIN.BASE_LR = 5e-4 +_C.TRAIN.WARMUP_LR = 5e-7 +_C.TRAIN.MIN_LR = 5e-6 +# Clip gradient norm +_C.TRAIN.CLIP_GRAD = 5.0 +# Auto resume from latest checkpoint +_C.TRAIN.AUTO_RESUME = True +# Gradient accumulation steps +# could be overwritten by command line argument +_C.TRAIN.ACCUMULATION_STEPS = 0 +# Whether to use gradient checkpointing to save memory +# could be overwritten by command line argument +_C.TRAIN.USE_CHECKPOINT = False + +# LR scheduler +_C.TRAIN.LR_SCHEDULER = CN() +_C.TRAIN.LR_SCHEDULER.NAME = 'cosine' +# Epoch interval to decay LR, used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_EPOCHS = 30 +# LR decay rate, used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_RATE = 0.1 + +# Optimizer +_C.TRAIN.OPTIMIZER = CN() +_C.TRAIN.OPTIMIZER.NAME = 'adamw' +# Optimizer Epsilon +_C.TRAIN.OPTIMIZER.EPS = 1e-8 +# Optimizer Betas +_C.TRAIN.OPTIMIZER.BETAS = (0.9, 0.999) +# SGD momentum +_C.TRAIN.OPTIMIZER.MOMENTUM = 0.9 + +# ----------------------------------------------------------------------------- +# Augmentation settings +# ----------------------------------------------------------------------------- +_C.AUG = CN() +# Color jitter factor +_C.AUG.COLOR_JITTER = 0.4 +# Use AutoAugment policy. 
"v0" or "original" +_C.AUG.AUTO_AUGMENT = 'rand-m9-mstd0.5-inc1' +# Random erase prob +_C.AUG.REPROB = 0.25 +# Random erase mode +_C.AUG.REMODE = 'pixel' +# Random erase count +_C.AUG.RECOUNT = 1 +# Mixup alpha, mixup enabled if > 0 +_C.AUG.MIXUP = 0.8 +# Cutmix alpha, cutmix enabled if > 0 +_C.AUG.CUTMIX = 1.0 +# Cutmix min/max ratio, overrides alpha and enables cutmix if set +_C.AUG.CUTMIX_MINMAX = None +# Probability of performing mixup or cutmix when either/both is enabled +_C.AUG.MIXUP_PROB = 1.0 +# Probability of switching to cutmix when both mixup and cutmix enabled +_C.AUG.MIXUP_SWITCH_PROB = 0.5 +# How to apply mixup/cutmix params. Per "batch", "pair", or "elem" +_C.AUG.MIXUP_MODE = 'batch' + +# ----------------------------------------------------------------------------- +# Testing settings +# ----------------------------------------------------------------------------- +_C.TEST = CN() +# Whether to use center crop when testing +_C.TEST.CROP = True + +# ----------------------------------------------------------------------------- +# Misc +# ----------------------------------------------------------------------------- +# Mixed precision opt level, if O0, no amp is used ('O0', 'O1', 'O2') +# overwritten by command line argument +_C.AMP_OPT_LEVEL = '' +# Path to output folder, overwritten by command line argument +_C.OUTPUT = '' +# Tag of experiment, overwritten by command line argument +_C.TAG = 'default' +# Frequency to save checkpoint +_C.SAVE_FREQ = 1 +# Frequency to logging info +_C.PRINT_FREQ = 10 +# Fixed random seed +_C.SEED = 0 +# Perform evaluation only, overwritten by command line argument +_C.EVAL_MODE = False +# Test throughput only, overwritten by command line argument +_C.THROUGHPUT_MODE = False +# local rank for DistributedDataParallel, given by command line argument +_C.LOCAL_RANK = 0 + + +def _update_config_from_file(config, cfg_file): + config.defrost() + with open(cfg_file, 'r') as f: + yaml_cfg = yaml.load(f, Loader=yaml.FullLoader) + + 
def update_config(config, args):
    """
    Merge the config file named by args.cfg, then the free-form args.opts list,
    then the individual command-line overrides into `config`, and freeze it.

    An override is applied only when its command-line value is truthy.
    """
    _update_config_from_file(config, args.cfg)

    config.defrost()
    if args.opts:
        config.merge_from_list(args.opts)

    # merge from specific arguments
    # (args attribute, config section or None for the root node, key, forced value)
    # forced value None means "copy the argument value"; flags force True.
    overrides = (
        ('batch_size', 'DATA', 'BATCH_SIZE', None),
        ('zip', 'DATA', 'ZIP_MODE', True),
        ('cache_mode', 'DATA', 'CACHE_MODE', None),
        ('resume', 'MODEL', 'RESUME', None),
        ('accumulation_steps', 'TRAIN', 'ACCUMULATION_STEPS', None),
        ('use_checkpoint', 'TRAIN', 'USE_CHECKPOINT', True),
        ('amp_opt_level', None, 'AMP_OPT_LEVEL', None),
        ('tag', None, 'TAG', None),
        ('eval', None, 'EVAL_MODE', True),
        ('throughput', None, 'THROUGHPUT_MODE', True),
    )
    for arg_name, section, key, forced in overrides:
        value = getattr(args, arg_name)
        if not value:
            continue
        # the section node is resolved lazily, only when an override is applied
        node = config if section is None else getattr(config, section)
        setattr(node, key, value if forced is None else forced)

    config.freeze()
1] + NUM_HEADS: [ 3, 6, 12, 24 ] + WINDOW_SIZE: 7 \ No newline at end of file diff --git a/PuzzleTuning/SSL_structures/Swin_Unet_main/datasets/README.md b/PuzzleTuning/SSL_structures/Swin_Unet_main/datasets/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c662f8e2c24f9b5a899338cfe5fcd67f43d1bba5 --- /dev/null +++ b/PuzzleTuning/SSL_structures/Swin_Unet_main/datasets/README.md @@ -0,0 +1,29 @@ +# Data Preparing + +1. Access to the synapse multi-organ dataset: + 1. Sign up in the [official Synapse website](https://www.synapse.org/#!Synapse:syn3193805/wiki/) and download the dataset. Convert them to numpy format, clip the images within [-125, 275], normalize each 3D image to [0, 1], and extract 2D slices from 3D volume for training cases while keeping the 3D volume in h5 format for testing cases. + 2. You can also send an Email directly to jienengchen01 AT gmail.com to request the preprocessed data for reproduction. +2. The directory structure of the whole project is as follows: + +```bash +. +├── TransUNet +│   ├──datasets +│   │    └── dataset_*.py +│   ├──train.py +│   ├──test.py +│   └──... 
def random_rot_flip(image, label):
    """
    Apply the same random augmentation to an image and its label map:
    a rotation by a random multiple of 90 degrees followed by a flip along a
    randomly chosen one of the first two axes.

    Returns the augmented (image, label) pair as fresh copies.
    """
    # draw both random decisions up front: number of quarter turns, then flip axis
    quarter_turns = np.random.randint(0, 4)
    flip_axis = np.random.randint(0, 2)

    def _augment(arr):
        # copy() materialises the rotated/flipped view as an independent array
        return np.flip(np.rot90(arr, quarter_turns), axis=flip_axis).copy()

    return _augment(image), _augment(label)
class Synapse_dataset(Dataset):
    """
    Synapse dataset reader.

    The sample names are read from `<list_dir>/<split>.txt` (one name per line).
    For split "train" each item is a 2D slice stored as `<base_dir>/<name>.npz`;
    for every other split each item is a volume stored as
    `<base_dir>/<name>.npy.h5`. Both file kinds provide the arrays 'image' and 'label'.
    """

    def __init__(self, base_dir, list_dir, split, transform=None):
        self.transform = transform  # using transform in torch!
        self.split = split
        # read the list through a context manager so the file handle is released
        with open(os.path.join(list_dir, self.split + '.txt')) as list_file:
            self.sample_list = list_file.readlines()
        self.data_dir = base_dir

    def __len__(self):
        return len(self.sample_list)

    def __getitem__(self, idx):
        """Return {'image', 'label', 'case_name'} for sample `idx` (after the optional transform)."""
        case_name = self.sample_list[idx].strip('\n')

        if self.split == "train":
            data_path = os.path.join(self.data_dir, case_name + '.npz')
            # indexing the archive loads the arrays into memory, so it can be
            # closed right away instead of leaking one open file per item
            with np.load(data_path) as data:
                image, label = data['image'], data['label']
        else:
            filepath = self.data_dir + "/{}.npy.h5".format(case_name)
            # open read-only explicitly (older h5py versions default to a
            # read/write mode) and close the file once the data is copied out
            with h5py.File(filepath, 'r') as data:
                image, label = data['image'][:], data['label'][:]

        sample = {'image': image, 'label': label}
        if self.transform:
            sample = self.transform(sample)
        sample['case_name'] = case_name
        return sample
+case0002.npy.h5 +case0029.npy.h5 +case0003.npy.h5 +case0001.npy.h5 +case0004.npy.h5 +case0025.npy.h5 +case0035.npy.h5 diff --git a/PuzzleTuning/SSL_structures/Swin_Unet_main/lists/lists_Synapse/test_vol.txt b/PuzzleTuning/SSL_structures/Swin_Unet_main/lists/lists_Synapse/test_vol.txt new file mode 100644 index 0000000000000000000000000000000000000000..1c4abd53044eed5457fd1f7e0cca1c99e7222593 --- /dev/null +++ b/PuzzleTuning/SSL_structures/Swin_Unet_main/lists/lists_Synapse/test_vol.txt @@ -0,0 +1,12 @@ +case0008 +case0022 +case0038 +case0036 +case0032 +case0002 +case0029 +case0003 +case0001 +case0004 +case0025 +case0035 diff --git a/PuzzleTuning/SSL_structures/Swin_Unet_main/lists/lists_Synapse/train.txt b/PuzzleTuning/SSL_structures/Swin_Unet_main/lists/lists_Synapse/train.txt new file mode 100644 index 0000000000000000000000000000000000000000..e58616844994a95407d1f664b79cd4e4533d41b8 --- /dev/null +++ b/PuzzleTuning/SSL_structures/Swin_Unet_main/lists/lists_Synapse/train.txt @@ -0,0 +1,2211 @@ +case0031_slice000 +case0031_slice001 +case0031_slice002 +case0031_slice003 +case0031_slice004 +case0031_slice005 +case0031_slice006 +case0031_slice007 +case0031_slice008 +case0031_slice009 +case0031_slice010 +case0031_slice011 +case0031_slice012 +case0031_slice013 +case0031_slice014 +case0031_slice015 +case0031_slice016 +case0031_slice017 +case0031_slice018 +case0031_slice019 +case0031_slice020 +case0031_slice021 +case0031_slice022 +case0031_slice023 +case0031_slice024 +case0031_slice025 +case0031_slice026 +case0031_slice027 +case0031_slice028 +case0031_slice029 +case0031_slice030 +case0031_slice031 +case0031_slice032 +case0031_slice033 +case0031_slice034 +case0031_slice035 +case0031_slice036 +case0031_slice037 +case0031_slice038 +case0031_slice039 +case0031_slice040 +case0031_slice041 +case0031_slice042 +case0031_slice043 +case0031_slice044 +case0031_slice045 +case0031_slice046 +case0031_slice047 +case0031_slice048 +case0031_slice049 +case0031_slice050 +case0031_slice051 
+case0031_slice052 +case0031_slice053 +case0031_slice054 +case0031_slice055 +case0031_slice056 +case0031_slice057 +case0031_slice058 +case0031_slice059 +case0031_slice060 +case0031_slice061 +case0031_slice062 +case0031_slice063 +case0031_slice064 +case0031_slice065 +case0031_slice066 +case0031_slice067 +case0031_slice068 +case0031_slice069 +case0031_slice070 +case0031_slice071 +case0031_slice072 +case0031_slice073 +case0031_slice074 +case0031_slice075 +case0031_slice076 +case0031_slice077 +case0031_slice078 +case0031_slice079 +case0031_slice080 +case0031_slice081 +case0031_slice082 +case0031_slice083 +case0031_slice084 +case0031_slice085 +case0031_slice086 +case0031_slice087 +case0031_slice088 +case0031_slice089 +case0031_slice090 +case0031_slice091 +case0031_slice092 +case0007_slice000 +case0007_slice001 +case0007_slice002 +case0007_slice003 +case0007_slice004 +case0007_slice005 +case0007_slice006 +case0007_slice007 +case0007_slice008 +case0007_slice009 +case0007_slice010 +case0007_slice011 +case0007_slice012 +case0007_slice013 +case0007_slice014 +case0007_slice015 +case0007_slice016 +case0007_slice017 +case0007_slice018 +case0007_slice019 +case0007_slice020 +case0007_slice021 +case0007_slice022 +case0007_slice023 +case0007_slice024 +case0007_slice025 +case0007_slice026 +case0007_slice027 +case0007_slice028 +case0007_slice029 +case0007_slice030 +case0007_slice031 +case0007_slice032 +case0007_slice033 +case0007_slice034 +case0007_slice035 +case0007_slice036 +case0007_slice037 +case0007_slice038 +case0007_slice039 +case0007_slice040 +case0007_slice041 +case0007_slice042 +case0007_slice043 +case0007_slice044 +case0007_slice045 +case0007_slice046 +case0007_slice047 +case0007_slice048 +case0007_slice049 +case0007_slice050 +case0007_slice051 +case0007_slice052 +case0007_slice053 +case0007_slice054 +case0007_slice055 +case0007_slice056 +case0007_slice057 +case0007_slice058 +case0007_slice059 +case0007_slice060 +case0007_slice061 +case0007_slice062 +case0007_slice063 
+case0007_slice064 +case0007_slice065 +case0007_slice066 +case0007_slice067 +case0007_slice068 +case0007_slice069 +case0007_slice070 +case0007_slice071 +case0007_slice072 +case0007_slice073 +case0007_slice074 +case0007_slice075 +case0007_slice076 +case0007_slice077 +case0007_slice078 +case0007_slice079 +case0007_slice080 +case0007_slice081 +case0007_slice082 +case0007_slice083 +case0007_slice084 +case0007_slice085 +case0007_slice086 +case0007_slice087 +case0007_slice088 +case0007_slice089 +case0007_slice090 +case0007_slice091 +case0007_slice092 +case0007_slice093 +case0007_slice094 +case0007_slice095 +case0007_slice096 +case0007_slice097 +case0007_slice098 +case0007_slice099 +case0007_slice100 +case0007_slice101 +case0007_slice102 +case0007_slice103 +case0007_slice104 +case0007_slice105 +case0007_slice106 +case0007_slice107 +case0007_slice108 +case0007_slice109 +case0007_slice110 +case0007_slice111 +case0007_slice112 +case0007_slice113 +case0007_slice114 +case0007_slice115 +case0007_slice116 +case0007_slice117 +case0007_slice118 +case0007_slice119 +case0007_slice120 +case0007_slice121 +case0007_slice122 +case0007_slice123 +case0007_slice124 +case0007_slice125 +case0007_slice126 +case0007_slice127 +case0007_slice128 +case0007_slice129 +case0007_slice130 +case0007_slice131 +case0007_slice132 +case0007_slice133 +case0007_slice134 +case0007_slice135 +case0007_slice136 +case0007_slice137 +case0007_slice138 +case0007_slice139 +case0007_slice140 +case0007_slice141 +case0007_slice142 +case0007_slice143 +case0007_slice144 +case0007_slice145 +case0007_slice146 +case0007_slice147 +case0007_slice148 +case0007_slice149 +case0007_slice150 +case0007_slice151 +case0007_slice152 +case0007_slice153 +case0007_slice154 +case0007_slice155 +case0007_slice156 +case0007_slice157 +case0007_slice158 +case0007_slice159 +case0007_slice160 +case0007_slice161 +case0007_slice162 +case0009_slice000 +case0009_slice001 +case0009_slice002 +case0009_slice003 +case0009_slice004 +case0009_slice005 
+case0009_slice006 +case0009_slice007 +case0009_slice008 +case0009_slice009 +case0009_slice010 +case0009_slice011 +case0009_slice012 +case0009_slice013 +case0009_slice014 +case0009_slice015 +case0009_slice016 +case0009_slice017 +case0009_slice018 +case0009_slice019 +case0009_slice020 +case0009_slice021 +case0009_slice022 +case0009_slice023 +case0009_slice024 +case0009_slice025 +case0009_slice026 +case0009_slice027 +case0009_slice028 +case0009_slice029 +case0009_slice030 +case0009_slice031 +case0009_slice032 +case0009_slice033 +case0009_slice034 +case0009_slice035 +case0009_slice036 +case0009_slice037 +case0009_slice038 +case0009_slice039 +case0009_slice040 +case0009_slice041 +case0009_slice042 +case0009_slice043 +case0009_slice044 +case0009_slice045 +case0009_slice046 +case0009_slice047 +case0009_slice048 +case0009_slice049 +case0009_slice050 +case0009_slice051 +case0009_slice052 +case0009_slice053 +case0009_slice054 +case0009_slice055 +case0009_slice056 +case0009_slice057 +case0009_slice058 +case0009_slice059 +case0009_slice060 +case0009_slice061 +case0009_slice062 +case0009_slice063 +case0009_slice064 +case0009_slice065 +case0009_slice066 +case0009_slice067 +case0009_slice068 +case0009_slice069 +case0009_slice070 +case0009_slice071 +case0009_slice072 +case0009_slice073 +case0009_slice074 +case0009_slice075 +case0009_slice076 +case0009_slice077 +case0009_slice078 +case0009_slice079 +case0009_slice080 +case0009_slice081 +case0009_slice082 +case0009_slice083 +case0009_slice084 +case0009_slice085 +case0009_slice086 +case0009_slice087 +case0009_slice088 +case0009_slice089 +case0009_slice090 +case0009_slice091 +case0009_slice092 +case0009_slice093 +case0009_slice094 +case0009_slice095 +case0009_slice096 +case0009_slice097 +case0009_slice098 +case0009_slice099 +case0009_slice100 +case0009_slice101 +case0009_slice102 +case0009_slice103 +case0009_slice104 +case0009_slice105 +case0009_slice106 +case0009_slice107 +case0009_slice108 +case0009_slice109 +case0009_slice110 
+case0009_slice111 +case0009_slice112 +case0009_slice113 +case0009_slice114 +case0009_slice115 +case0009_slice116 +case0009_slice117 +case0009_slice118 +case0009_slice119 +case0009_slice120 +case0009_slice121 +case0009_slice122 +case0009_slice123 +case0009_slice124 +case0009_slice125 +case0009_slice126 +case0009_slice127 +case0009_slice128 +case0009_slice129 +case0009_slice130 +case0009_slice131 +case0009_slice132 +case0009_slice133 +case0009_slice134 +case0009_slice135 +case0009_slice136 +case0009_slice137 +case0009_slice138 +case0009_slice139 +case0009_slice140 +case0009_slice141 +case0009_slice142 +case0009_slice143 +case0009_slice144 +case0009_slice145 +case0009_slice146 +case0009_slice147 +case0009_slice148 +case0005_slice000 +case0005_slice001 +case0005_slice002 +case0005_slice003 +case0005_slice004 +case0005_slice005 +case0005_slice006 +case0005_slice007 +case0005_slice008 +case0005_slice009 +case0005_slice010 +case0005_slice011 +case0005_slice012 +case0005_slice013 +case0005_slice014 +case0005_slice015 +case0005_slice016 +case0005_slice017 +case0005_slice018 +case0005_slice019 +case0005_slice020 +case0005_slice021 +case0005_slice022 +case0005_slice023 +case0005_slice024 +case0005_slice025 +case0005_slice026 +case0005_slice027 +case0005_slice028 +case0005_slice029 +case0005_slice030 +case0005_slice031 +case0005_slice032 +case0005_slice033 +case0005_slice034 +case0005_slice035 +case0005_slice036 +case0005_slice037 +case0005_slice038 +case0005_slice039 +case0005_slice040 +case0005_slice041 +case0005_slice042 +case0005_slice043 +case0005_slice044 +case0005_slice045 +case0005_slice046 +case0005_slice047 +case0005_slice048 +case0005_slice049 +case0005_slice050 +case0005_slice051 +case0005_slice052 +case0005_slice053 +case0005_slice054 +case0005_slice055 +case0005_slice056 +case0005_slice057 +case0005_slice058 +case0005_slice059 +case0005_slice060 +case0005_slice061 +case0005_slice062 +case0005_slice063 +case0005_slice064 +case0005_slice065 +case0005_slice066 
+case0005_slice067 +case0005_slice068 +case0005_slice069 +case0005_slice070 +case0005_slice071 +case0005_slice072 +case0005_slice073 +case0005_slice074 +case0005_slice075 +case0005_slice076 +case0005_slice077 +case0005_slice078 +case0005_slice079 +case0005_slice080 +case0005_slice081 +case0005_slice082 +case0005_slice083 +case0005_slice084 +case0005_slice085 +case0005_slice086 +case0005_slice087 +case0005_slice088 +case0005_slice089 +case0005_slice090 +case0005_slice091 +case0005_slice092 +case0005_slice093 +case0005_slice094 +case0005_slice095 +case0005_slice096 +case0005_slice097 +case0005_slice098 +case0005_slice099 +case0005_slice100 +case0005_slice101 +case0005_slice102 +case0005_slice103 +case0005_slice104 +case0005_slice105 +case0005_slice106 +case0005_slice107 +case0005_slice108 +case0005_slice109 +case0005_slice110 +case0005_slice111 +case0005_slice112 +case0005_slice113 +case0005_slice114 +case0005_slice115 +case0005_slice116 +case0026_slice000 +case0026_slice001 +case0026_slice002 +case0026_slice003 +case0026_slice004 +case0026_slice005 +case0026_slice006 +case0026_slice007 +case0026_slice008 +case0026_slice009 +case0026_slice010 +case0026_slice011 +case0026_slice012 +case0026_slice013 +case0026_slice014 +case0026_slice015 +case0026_slice016 +case0026_slice017 +case0026_slice018 +case0026_slice019 +case0026_slice020 +case0026_slice021 +case0026_slice022 +case0026_slice023 +case0026_slice024 +case0026_slice025 +case0026_slice026 +case0026_slice027 +case0026_slice028 +case0026_slice029 +case0026_slice030 +case0026_slice031 +case0026_slice032 +case0026_slice033 +case0026_slice034 +case0026_slice035 +case0026_slice036 +case0026_slice037 +case0026_slice038 +case0026_slice039 +case0026_slice040 +case0026_slice041 +case0026_slice042 +case0026_slice043 +case0026_slice044 +case0026_slice045 +case0026_slice046 +case0026_slice047 +case0026_slice048 +case0026_slice049 +case0026_slice050 +case0026_slice051 +case0026_slice052 +case0026_slice053 +case0026_slice054 
+case0026_slice055 +case0026_slice056 +case0026_slice057 +case0026_slice058 +case0026_slice059 +case0026_slice060 +case0026_slice061 +case0026_slice062 +case0026_slice063 +case0026_slice064 +case0026_slice065 +case0026_slice066 +case0026_slice067 +case0026_slice068 +case0026_slice069 +case0026_slice070 +case0026_slice071 +case0026_slice072 +case0026_slice073 +case0026_slice074 +case0026_slice075 +case0026_slice076 +case0026_slice077 +case0026_slice078 +case0026_slice079 +case0026_slice080 +case0026_slice081 +case0026_slice082 +case0026_slice083 +case0026_slice084 +case0026_slice085 +case0026_slice086 +case0026_slice087 +case0026_slice088 +case0026_slice089 +case0026_slice090 +case0026_slice091 +case0026_slice092 +case0026_slice093 +case0026_slice094 +case0026_slice095 +case0026_slice096 +case0026_slice097 +case0026_slice098 +case0026_slice099 +case0026_slice100 +case0026_slice101 +case0026_slice102 +case0026_slice103 +case0026_slice104 +case0026_slice105 +case0026_slice106 +case0026_slice107 +case0026_slice108 +case0026_slice109 +case0026_slice110 +case0026_slice111 +case0026_slice112 +case0026_slice113 +case0026_slice114 +case0026_slice115 +case0026_slice116 +case0026_slice117 +case0026_slice118 +case0026_slice119 +case0026_slice120 +case0026_slice121 +case0026_slice122 +case0026_slice123 +case0026_slice124 +case0026_slice125 +case0026_slice126 +case0026_slice127 +case0026_slice128 +case0026_slice129 +case0026_slice130 +case0039_slice000 +case0039_slice001 +case0039_slice002 +case0039_slice003 +case0039_slice004 +case0039_slice005 +case0039_slice006 +case0039_slice007 +case0039_slice008 +case0039_slice009 +case0039_slice010 +case0039_slice011 +case0039_slice012 +case0039_slice013 +case0039_slice014 +case0039_slice015 +case0039_slice016 +case0039_slice017 +case0039_slice018 +case0039_slice019 +case0039_slice020 +case0039_slice021 +case0039_slice022 +case0039_slice023 +case0039_slice024 +case0039_slice025 +case0039_slice026 +case0039_slice027 +case0039_slice028 
+case0039_slice029 +case0039_slice030 +case0039_slice031 +case0039_slice032 +case0039_slice033 +case0039_slice034 +case0039_slice035 +case0039_slice036 +case0039_slice037 +case0039_slice038 +case0039_slice039 +case0039_slice040 +case0039_slice041 +case0039_slice042 +case0039_slice043 +case0039_slice044 +case0039_slice045 +case0039_slice046 +case0039_slice047 +case0039_slice048 +case0039_slice049 +case0039_slice050 +case0039_slice051 +case0039_slice052 +case0039_slice053 +case0039_slice054 +case0039_slice055 +case0039_slice056 +case0039_slice057 +case0039_slice058 +case0039_slice059 +case0039_slice060 +case0039_slice061 +case0039_slice062 +case0039_slice063 +case0039_slice064 +case0039_slice065 +case0039_slice066 +case0039_slice067 +case0039_slice068 +case0039_slice069 +case0039_slice070 +case0039_slice071 +case0039_slice072 +case0039_slice073 +case0039_slice074 +case0039_slice075 +case0039_slice076 +case0039_slice077 +case0039_slice078 +case0039_slice079 +case0039_slice080 +case0039_slice081 +case0039_slice082 +case0039_slice083 +case0039_slice084 +case0039_slice085 +case0039_slice086 +case0039_slice087 +case0039_slice088 +case0039_slice089 +case0024_slice000 +case0024_slice001 +case0024_slice002 +case0024_slice003 +case0024_slice004 +case0024_slice005 +case0024_slice006 +case0024_slice007 +case0024_slice008 +case0024_slice009 +case0024_slice010 +case0024_slice011 +case0024_slice012 +case0024_slice013 +case0024_slice014 +case0024_slice015 +case0024_slice016 +case0024_slice017 +case0024_slice018 +case0024_slice019 +case0024_slice020 +case0024_slice021 +case0024_slice022 +case0024_slice023 +case0024_slice024 +case0024_slice025 +case0024_slice026 +case0024_slice027 +case0024_slice028 +case0024_slice029 +case0024_slice030 +case0024_slice031 +case0024_slice032 +case0024_slice033 +case0024_slice034 +case0024_slice035 +case0024_slice036 +case0024_slice037 +case0024_slice038 +case0024_slice039 +case0024_slice040 +case0024_slice041 +case0024_slice042 +case0024_slice043 
+case0024_slice044 +case0024_slice045 +case0024_slice046 +case0024_slice047 +case0024_slice048 +case0024_slice049 +case0024_slice050 +case0024_slice051 +case0024_slice052 +case0024_slice053 +case0024_slice054 +case0024_slice055 +case0024_slice056 +case0024_slice057 +case0024_slice058 +case0024_slice059 +case0024_slice060 +case0024_slice061 +case0024_slice062 +case0024_slice063 +case0024_slice064 +case0024_slice065 +case0024_slice066 +case0024_slice067 +case0024_slice068 +case0024_slice069 +case0024_slice070 +case0024_slice071 +case0024_slice072 +case0024_slice073 +case0024_slice074 +case0024_slice075 +case0024_slice076 +case0024_slice077 +case0024_slice078 +case0024_slice079 +case0024_slice080 +case0024_slice081 +case0024_slice082 +case0024_slice083 +case0024_slice084 +case0024_slice085 +case0024_slice086 +case0024_slice087 +case0024_slice088 +case0024_slice089 +case0024_slice090 +case0024_slice091 +case0024_slice092 +case0024_slice093 +case0024_slice094 +case0024_slice095 +case0024_slice096 +case0024_slice097 +case0024_slice098 +case0024_slice099 +case0024_slice100 +case0024_slice101 +case0024_slice102 +case0024_slice103 +case0024_slice104 +case0024_slice105 +case0024_slice106 +case0024_slice107 +case0024_slice108 +case0024_slice109 +case0024_slice110 +case0024_slice111 +case0024_slice112 +case0024_slice113 +case0024_slice114 +case0024_slice115 +case0024_slice116 +case0024_slice117 +case0024_slice118 +case0024_slice119 +case0024_slice120 +case0024_slice121 +case0024_slice122 +case0024_slice123 +case0034_slice000 +case0034_slice001 +case0034_slice002 +case0034_slice003 +case0034_slice004 +case0034_slice005 +case0034_slice006 +case0034_slice007 +case0034_slice008 +case0034_slice009 +case0034_slice010 +case0034_slice011 +case0034_slice012 +case0034_slice013 +case0034_slice014 +case0034_slice015 +case0034_slice016 +case0034_slice017 +case0034_slice018 +case0034_slice019 +case0034_slice020 +case0034_slice021 +case0034_slice022 +case0034_slice023 +case0034_slice024 
+case0034_slice025 +case0034_slice026 +case0034_slice027 +case0034_slice028 +case0034_slice029 +case0034_slice030 +case0034_slice031 +case0034_slice032 +case0034_slice033 +case0034_slice034 +case0034_slice035 +case0034_slice036 +case0034_slice037 +case0034_slice038 +case0034_slice039 +case0034_slice040 +case0034_slice041 +case0034_slice042 +case0034_slice043 +case0034_slice044 +case0034_slice045 +case0034_slice046 +case0034_slice047 +case0034_slice048 +case0034_slice049 +case0034_slice050 +case0034_slice051 +case0034_slice052 +case0034_slice053 +case0034_slice054 +case0034_slice055 +case0034_slice056 +case0034_slice057 +case0034_slice058 +case0034_slice059 +case0034_slice060 +case0034_slice061 +case0034_slice062 +case0034_slice063 +case0034_slice064 +case0034_slice065 +case0034_slice066 +case0034_slice067 +case0034_slice068 +case0034_slice069 +case0034_slice070 +case0034_slice071 +case0034_slice072 +case0034_slice073 +case0034_slice074 +case0034_slice075 +case0034_slice076 +case0034_slice077 +case0034_slice078 +case0034_slice079 +case0034_slice080 +case0034_slice081 +case0034_slice082 +case0034_slice083 +case0034_slice084 +case0034_slice085 +case0034_slice086 +case0034_slice087 +case0034_slice088 +case0034_slice089 +case0034_slice090 +case0034_slice091 +case0034_slice092 +case0034_slice093 +case0034_slice094 +case0034_slice095 +case0034_slice096 +case0034_slice097 +case0033_slice000 +case0033_slice001 +case0033_slice002 +case0033_slice003 +case0033_slice004 +case0033_slice005 +case0033_slice006 +case0033_slice007 +case0033_slice008 +case0033_slice009 +case0033_slice010 +case0033_slice011 +case0033_slice012 +case0033_slice013 +case0033_slice014 +case0033_slice015 +case0033_slice016 +case0033_slice017 +case0033_slice018 +case0033_slice019 +case0033_slice020 +case0033_slice021 +case0033_slice022 +case0033_slice023 +case0033_slice024 +case0033_slice025 +case0033_slice026 +case0033_slice027 +case0033_slice028 +case0033_slice029 +case0033_slice030 +case0033_slice031 
+case0033_slice032 +case0033_slice033 +case0033_slice034 +case0033_slice035 +case0033_slice036 +case0033_slice037 +case0033_slice038 +case0033_slice039 +case0033_slice040 +case0033_slice041 +case0033_slice042 +case0033_slice043 +case0033_slice044 +case0033_slice045 +case0033_slice046 +case0033_slice047 +case0033_slice048 +case0033_slice049 +case0033_slice050 +case0033_slice051 +case0033_slice052 +case0033_slice053 +case0033_slice054 +case0033_slice055 +case0033_slice056 +case0033_slice057 +case0033_slice058 +case0033_slice059 +case0033_slice060 +case0033_slice061 +case0033_slice062 +case0033_slice063 +case0033_slice064 +case0033_slice065 +case0033_slice066 +case0033_slice067 +case0033_slice068 +case0033_slice069 +case0033_slice070 +case0033_slice071 +case0033_slice072 +case0033_slice073 +case0033_slice074 +case0033_slice075 +case0033_slice076 +case0033_slice077 +case0033_slice078 +case0033_slice079 +case0033_slice080 +case0033_slice081 +case0033_slice082 +case0033_slice083 +case0033_slice084 +case0033_slice085 +case0033_slice086 +case0033_slice087 +case0033_slice088 +case0033_slice089 +case0033_slice090 +case0033_slice091 +case0033_slice092 +case0033_slice093 +case0033_slice094 +case0033_slice095 +case0033_slice096 +case0033_slice097 +case0033_slice098 +case0033_slice099 +case0033_slice100 +case0033_slice101 +case0033_slice102 +case0033_slice103 +case0030_slice000 +case0030_slice001 +case0030_slice002 +case0030_slice003 +case0030_slice004 +case0030_slice005 +case0030_slice006 +case0030_slice007 +case0030_slice008 +case0030_slice009 +case0030_slice010 +case0030_slice011 +case0030_slice012 +case0030_slice013 +case0030_slice014 +case0030_slice015 +case0030_slice016 +case0030_slice017 +case0030_slice018 +case0030_slice019 +case0030_slice020 +case0030_slice021 +case0030_slice022 +case0030_slice023 +case0030_slice024 +case0030_slice025 +case0030_slice026 +case0030_slice027 +case0030_slice028 +case0030_slice029 +case0030_slice030 +case0030_slice031 +case0030_slice032 
+case0030_slice033 +case0030_slice034 +case0030_slice035 +case0030_slice036 +case0030_slice037 +case0030_slice038 +case0030_slice039 +case0030_slice040 +case0030_slice041 +case0030_slice042 +case0030_slice043 +case0030_slice044 +case0030_slice045 +case0030_slice046 +case0030_slice047 +case0030_slice048 +case0030_slice049 +case0030_slice050 +case0030_slice051 +case0030_slice052 +case0030_slice053 +case0030_slice054 +case0030_slice055 +case0030_slice056 +case0030_slice057 +case0030_slice058 +case0030_slice059 +case0030_slice060 +case0030_slice061 +case0030_slice062 +case0030_slice063 +case0030_slice064 +case0030_slice065 +case0030_slice066 +case0030_slice067 +case0030_slice068 +case0030_slice069 +case0030_slice070 +case0030_slice071 +case0030_slice072 +case0030_slice073 +case0030_slice074 +case0030_slice075 +case0030_slice076 +case0030_slice077 +case0030_slice078 +case0030_slice079 +case0030_slice080 +case0030_slice081 +case0030_slice082 +case0030_slice083 +case0030_slice084 +case0030_slice085 +case0030_slice086 +case0030_slice087 +case0030_slice088 +case0030_slice089 +case0030_slice090 +case0030_slice091 +case0030_slice092 +case0030_slice093 +case0030_slice094 +case0030_slice095 +case0030_slice096 +case0030_slice097 +case0030_slice098 +case0030_slice099 +case0030_slice100 +case0030_slice101 +case0030_slice102 +case0030_slice103 +case0030_slice104 +case0030_slice105 +case0030_slice106 +case0030_slice107 +case0030_slice108 +case0030_slice109 +case0030_slice110 +case0030_slice111 +case0030_slice112 +case0030_slice113 +case0030_slice114 +case0030_slice115 +case0030_slice116 +case0030_slice117 +case0030_slice118 +case0030_slice119 +case0030_slice120 +case0030_slice121 +case0030_slice122 +case0030_slice123 +case0030_slice124 +case0030_slice125 +case0030_slice126 +case0030_slice127 +case0030_slice128 +case0030_slice129 +case0030_slice130 +case0030_slice131 +case0030_slice132 +case0030_slice133 +case0030_slice134 +case0030_slice135 +case0030_slice136 +case0030_slice137 
+case0030_slice138 +case0030_slice139 +case0030_slice140 +case0030_slice141 +case0030_slice142 +case0030_slice143 +case0030_slice144 +case0030_slice145 +case0030_slice146 +case0030_slice147 +case0030_slice148 +case0030_slice149 +case0030_slice150 +case0030_slice151 +case0030_slice152 +case0023_slice000 +case0023_slice001 +case0023_slice002 +case0023_slice003 +case0023_slice004 +case0023_slice005 +case0023_slice006 +case0023_slice007 +case0023_slice008 +case0023_slice009 +case0023_slice010 +case0023_slice011 +case0023_slice012 +case0023_slice013 +case0023_slice014 +case0023_slice015 +case0023_slice016 +case0023_slice017 +case0023_slice018 +case0023_slice019 +case0023_slice020 +case0023_slice021 +case0023_slice022 +case0023_slice023 +case0023_slice024 +case0023_slice025 +case0023_slice026 +case0023_slice027 +case0023_slice028 +case0023_slice029 +case0023_slice030 +case0023_slice031 +case0023_slice032 +case0023_slice033 +case0023_slice034 +case0023_slice035 +case0023_slice036 +case0023_slice037 +case0023_slice038 +case0023_slice039 +case0023_slice040 +case0023_slice041 +case0023_slice042 +case0023_slice043 +case0023_slice044 +case0023_slice045 +case0023_slice046 +case0023_slice047 +case0023_slice048 +case0023_slice049 +case0023_slice050 +case0023_slice051 +case0023_slice052 +case0023_slice053 +case0023_slice054 +case0023_slice055 +case0023_slice056 +case0023_slice057 +case0023_slice058 +case0023_slice059 +case0023_slice060 +case0023_slice061 +case0023_slice062 +case0023_slice063 +case0023_slice064 +case0023_slice065 +case0023_slice066 +case0023_slice067 +case0023_slice068 +case0023_slice069 +case0023_slice070 +case0023_slice071 +case0023_slice072 +case0023_slice073 +case0023_slice074 +case0023_slice075 +case0023_slice076 +case0023_slice077 +case0023_slice078 +case0023_slice079 +case0023_slice080 +case0023_slice081 +case0023_slice082 +case0023_slice083 +case0023_slice084 +case0023_slice085 +case0023_slice086 +case0023_slice087 +case0023_slice088 +case0023_slice089 
+case0023_slice090 +case0023_slice091 +case0023_slice092 +case0023_slice093 +case0023_slice094 +case0023_slice095 +case0040_slice000 +case0040_slice001 +case0040_slice002 +case0040_slice003 +case0040_slice004 +case0040_slice005 +case0040_slice006 +case0040_slice007 +case0040_slice008 +case0040_slice009 +case0040_slice010 +case0040_slice011 +case0040_slice012 +case0040_slice013 +case0040_slice014 +case0040_slice015 +case0040_slice016 +case0040_slice017 +case0040_slice018 +case0040_slice019 +case0040_slice020 +case0040_slice021 +case0040_slice022 +case0040_slice023 +case0040_slice024 +case0040_slice025 +case0040_slice026 +case0040_slice027 +case0040_slice028 +case0040_slice029 +case0040_slice030 +case0040_slice031 +case0040_slice032 +case0040_slice033 +case0040_slice034 +case0040_slice035 +case0040_slice036 +case0040_slice037 +case0040_slice038 +case0040_slice039 +case0040_slice040 +case0040_slice041 +case0040_slice042 +case0040_slice043 +case0040_slice044 +case0040_slice045 +case0040_slice046 +case0040_slice047 +case0040_slice048 +case0040_slice049 +case0040_slice050 +case0040_slice051 +case0040_slice052 +case0040_slice053 +case0040_slice054 +case0040_slice055 +case0040_slice056 +case0040_slice057 +case0040_slice058 +case0040_slice059 +case0040_slice060 +case0040_slice061 +case0040_slice062 +case0040_slice063 +case0040_slice064 +case0040_slice065 +case0040_slice066 +case0040_slice067 +case0040_slice068 +case0040_slice069 +case0040_slice070 +case0040_slice071 +case0040_slice072 +case0040_slice073 +case0040_slice074 +case0040_slice075 +case0040_slice076 +case0040_slice077 +case0040_slice078 +case0040_slice079 +case0040_slice080 +case0040_slice081 +case0040_slice082 +case0040_slice083 +case0040_slice084 +case0040_slice085 +case0040_slice086 +case0040_slice087 +case0040_slice088 +case0040_slice089 +case0040_slice090 +case0040_slice091 +case0040_slice092 +case0040_slice093 +case0040_slice094 +case0040_slice095 +case0040_slice096 +case0040_slice097 +case0040_slice098 
+case0040_slice099 +case0040_slice100 +case0040_slice101 +case0040_slice102 +case0040_slice103 +case0040_slice104 +case0040_slice105 +case0040_slice106 +case0040_slice107 +case0040_slice108 +case0040_slice109 +case0040_slice110 +case0040_slice111 +case0040_slice112 +case0040_slice113 +case0040_slice114 +case0040_slice115 +case0040_slice116 +case0040_slice117 +case0040_slice118 +case0040_slice119 +case0040_slice120 +case0040_slice121 +case0040_slice122 +case0040_slice123 +case0040_slice124 +case0040_slice125 +case0040_slice126 +case0040_slice127 +case0040_slice128 +case0040_slice129 +case0040_slice130 +case0040_slice131 +case0040_slice132 +case0040_slice133 +case0040_slice134 +case0040_slice135 +case0040_slice136 +case0040_slice137 +case0040_slice138 +case0040_slice139 +case0040_slice140 +case0040_slice141 +case0040_slice142 +case0040_slice143 +case0040_slice144 +case0040_slice145 +case0040_slice146 +case0040_slice147 +case0040_slice148 +case0040_slice149 +case0040_slice150 +case0040_slice151 +case0040_slice152 +case0040_slice153 +case0040_slice154 +case0040_slice155 +case0040_slice156 +case0040_slice157 +case0040_slice158 +case0040_slice159 +case0040_slice160 +case0040_slice161 +case0040_slice162 +case0040_slice163 +case0040_slice164 +case0040_slice165 +case0040_slice166 +case0040_slice167 +case0040_slice168 +case0040_slice169 +case0040_slice170 +case0040_slice171 +case0040_slice172 +case0040_slice173 +case0040_slice174 +case0040_slice175 +case0040_slice176 +case0040_slice177 +case0040_slice178 +case0040_slice179 +case0040_slice180 +case0040_slice181 +case0040_slice182 +case0040_slice183 +case0040_slice184 +case0040_slice185 +case0040_slice186 +case0040_slice187 +case0040_slice188 +case0040_slice189 +case0040_slice190 +case0040_slice191 +case0040_slice192 +case0040_slice193 +case0040_slice194 +case0010_slice000 +case0010_slice001 +case0010_slice002 +case0010_slice003 +case0010_slice004 +case0010_slice005 +case0010_slice006 +case0010_slice007 +case0010_slice008 
+case0010_slice009 +case0010_slice010 +case0010_slice011 +case0010_slice012 +case0010_slice013 +case0010_slice014 +case0010_slice015 +case0010_slice016 +case0010_slice017 +case0010_slice018 +case0010_slice019 +case0010_slice020 +case0010_slice021 +case0010_slice022 +case0010_slice023 +case0010_slice024 +case0010_slice025 +case0010_slice026 +case0010_slice027 +case0010_slice028 +case0010_slice029 +case0010_slice030 +case0010_slice031 +case0010_slice032 +case0010_slice033 +case0010_slice034 +case0010_slice035 +case0010_slice036 +case0010_slice037 +case0010_slice038 +case0010_slice039 +case0010_slice040 +case0010_slice041 +case0010_slice042 +case0010_slice043 +case0010_slice044 +case0010_slice045 +case0010_slice046 +case0010_slice047 +case0010_slice048 +case0010_slice049 +case0010_slice050 +case0010_slice051 +case0010_slice052 +case0010_slice053 +case0010_slice054 +case0010_slice055 +case0010_slice056 +case0010_slice057 +case0010_slice058 +case0010_slice059 +case0010_slice060 +case0010_slice061 +case0010_slice062 +case0010_slice063 +case0010_slice064 +case0010_slice065 +case0010_slice066 +case0010_slice067 +case0010_slice068 +case0010_slice069 +case0010_slice070 +case0010_slice071 +case0010_slice072 +case0010_slice073 +case0010_slice074 +case0010_slice075 +case0010_slice076 +case0010_slice077 +case0010_slice078 +case0010_slice079 +case0010_slice080 +case0010_slice081 +case0010_slice082 +case0010_slice083 +case0010_slice084 +case0010_slice085 +case0010_slice086 +case0010_slice087 +case0010_slice088 +case0010_slice089 +case0010_slice090 +case0010_slice091 +case0010_slice092 +case0010_slice093 +case0010_slice094 +case0010_slice095 +case0010_slice096 +case0010_slice097 +case0010_slice098 +case0010_slice099 +case0010_slice100 +case0010_slice101 +case0010_slice102 +case0010_slice103 +case0010_slice104 +case0010_slice105 +case0010_slice106 +case0010_slice107 +case0010_slice108 +case0010_slice109 +case0010_slice110 +case0010_slice111 +case0010_slice112 +case0010_slice113 
+case0010_slice114 +case0010_slice115 +case0010_slice116 +case0010_slice117 +case0010_slice118 +case0010_slice119 +case0010_slice120 +case0010_slice121 +case0010_slice122 +case0010_slice123 +case0010_slice124 +case0010_slice125 +case0010_slice126 +case0010_slice127 +case0010_slice128 +case0010_slice129 +case0010_slice130 +case0010_slice131 +case0010_slice132 +case0010_slice133 +case0010_slice134 +case0010_slice135 +case0010_slice136 +case0010_slice137 +case0010_slice138 +case0010_slice139 +case0010_slice140 +case0010_slice141 +case0010_slice142 +case0010_slice143 +case0010_slice144 +case0010_slice145 +case0010_slice146 +case0010_slice147 +case0021_slice000 +case0021_slice001 +case0021_slice002 +case0021_slice003 +case0021_slice004 +case0021_slice005 +case0021_slice006 +case0021_slice007 +case0021_slice008 +case0021_slice009 +case0021_slice010 +case0021_slice011 +case0021_slice012 +case0021_slice013 +case0021_slice014 +case0021_slice015 +case0021_slice016 +case0021_slice017 +case0021_slice018 +case0021_slice019 +case0021_slice020 +case0021_slice021 +case0021_slice022 +case0021_slice023 +case0021_slice024 +case0021_slice025 +case0021_slice026 +case0021_slice027 +case0021_slice028 +case0021_slice029 +case0021_slice030 +case0021_slice031 +case0021_slice032 +case0021_slice033 +case0021_slice034 +case0021_slice035 +case0021_slice036 +case0021_slice037 +case0021_slice038 +case0021_slice039 +case0021_slice040 +case0021_slice041 +case0021_slice042 +case0021_slice043 +case0021_slice044 +case0021_slice045 +case0021_slice046 +case0021_slice047 +case0021_slice048 +case0021_slice049 +case0021_slice050 +case0021_slice051 +case0021_slice052 +case0021_slice053 +case0021_slice054 +case0021_slice055 +case0021_slice056 +case0021_slice057 +case0021_slice058 +case0021_slice059 +case0021_slice060 +case0021_slice061 +case0021_slice062 +case0021_slice063 +case0021_slice064 +case0021_slice065 +case0021_slice066 +case0021_slice067 +case0021_slice068 +case0021_slice069 +case0021_slice070 
+case0021_slice071 +case0021_slice072 +case0021_slice073 +case0021_slice074 +case0021_slice075 +case0021_slice076 +case0021_slice077 +case0021_slice078 +case0021_slice079 +case0021_slice080 +case0021_slice081 +case0021_slice082 +case0021_slice083 +case0021_slice084 +case0021_slice085 +case0021_slice086 +case0021_slice087 +case0021_slice088 +case0021_slice089 +case0021_slice090 +case0021_slice091 +case0021_slice092 +case0021_slice093 +case0021_slice094 +case0021_slice095 +case0021_slice096 +case0021_slice097 +case0021_slice098 +case0021_slice099 +case0021_slice100 +case0021_slice101 +case0021_slice102 +case0021_slice103 +case0021_slice104 +case0021_slice105 +case0021_slice106 +case0021_slice107 +case0021_slice108 +case0021_slice109 +case0021_slice110 +case0021_slice111 +case0021_slice112 +case0021_slice113 +case0021_slice114 +case0021_slice115 +case0021_slice116 +case0021_slice117 +case0021_slice118 +case0021_slice119 +case0021_slice120 +case0021_slice121 +case0021_slice122 +case0021_slice123 +case0021_slice124 +case0021_slice125 +case0021_slice126 +case0021_slice127 +case0021_slice128 +case0021_slice129 +case0021_slice130 +case0021_slice131 +case0021_slice132 +case0021_slice133 +case0021_slice134 +case0021_slice135 +case0021_slice136 +case0021_slice137 +case0021_slice138 +case0021_slice139 +case0021_slice140 +case0021_slice141 +case0021_slice142 +case0006_slice000 +case0006_slice001 +case0006_slice002 +case0006_slice003 +case0006_slice004 +case0006_slice005 +case0006_slice006 +case0006_slice007 +case0006_slice008 +case0006_slice009 +case0006_slice010 +case0006_slice011 +case0006_slice012 +case0006_slice013 +case0006_slice014 +case0006_slice015 +case0006_slice016 +case0006_slice017 +case0006_slice018 +case0006_slice019 +case0006_slice020 +case0006_slice021 +case0006_slice022 +case0006_slice023 +case0006_slice024 +case0006_slice025 +case0006_slice026 +case0006_slice027 +case0006_slice028 +case0006_slice029 +case0006_slice030 +case0006_slice031 +case0006_slice032 
+case0006_slice033 +case0006_slice034 +case0006_slice035 +case0006_slice036 +case0006_slice037 +case0006_slice038 +case0006_slice039 +case0006_slice040 +case0006_slice041 +case0006_slice042 +case0006_slice043 +case0006_slice044 +case0006_slice045 +case0006_slice046 +case0006_slice047 +case0006_slice048 +case0006_slice049 +case0006_slice050 +case0006_slice051 +case0006_slice052 +case0006_slice053 +case0006_slice054 +case0006_slice055 +case0006_slice056 +case0006_slice057 +case0006_slice058 +case0006_slice059 +case0006_slice060 +case0006_slice061 +case0006_slice062 +case0006_slice063 +case0006_slice064 +case0006_slice065 +case0006_slice066 +case0006_slice067 +case0006_slice068 +case0006_slice069 +case0006_slice070 +case0006_slice071 +case0006_slice072 +case0006_slice073 +case0006_slice074 +case0006_slice075 +case0006_slice076 +case0006_slice077 +case0006_slice078 +case0006_slice079 +case0006_slice080 +case0006_slice081 +case0006_slice082 +case0006_slice083 +case0006_slice084 +case0006_slice085 +case0006_slice086 +case0006_slice087 +case0006_slice088 +case0006_slice089 +case0006_slice090 +case0006_slice091 +case0006_slice092 +case0006_slice093 +case0006_slice094 +case0006_slice095 +case0006_slice096 +case0006_slice097 +case0006_slice098 +case0006_slice099 +case0006_slice100 +case0006_slice101 +case0006_slice102 +case0006_slice103 +case0006_slice104 +case0006_slice105 +case0006_slice106 +case0006_slice107 +case0006_slice108 +case0006_slice109 +case0006_slice110 +case0006_slice111 +case0006_slice112 +case0006_slice113 +case0006_slice114 +case0006_slice115 +case0006_slice116 +case0006_slice117 +case0006_slice118 +case0006_slice119 +case0006_slice120 +case0006_slice121 +case0006_slice122 +case0006_slice123 +case0006_slice124 +case0006_slice125 +case0006_slice126 +case0006_slice127 +case0006_slice128 +case0006_slice129 +case0006_slice130 +case0027_slice000 +case0027_slice001 +case0027_slice002 +case0027_slice003 +case0027_slice004 +case0027_slice005 +case0027_slice006 
+case0027_slice007 +case0027_slice008 +case0027_slice009 +case0027_slice010 +case0027_slice011 +case0027_slice012 +case0027_slice013 +case0027_slice014 +case0027_slice015 +case0027_slice016 +case0027_slice017 +case0027_slice018 +case0027_slice019 +case0027_slice020 +case0027_slice021 +case0027_slice022 +case0027_slice023 +case0027_slice024 +case0027_slice025 +case0027_slice026 +case0027_slice027 +case0027_slice028 +case0027_slice029 +case0027_slice030 +case0027_slice031 +case0027_slice032 +case0027_slice033 +case0027_slice034 +case0027_slice035 +case0027_slice036 +case0027_slice037 +case0027_slice038 +case0027_slice039 +case0027_slice040 +case0027_slice041 +case0027_slice042 +case0027_slice043 +case0027_slice044 +case0027_slice045 +case0027_slice046 +case0027_slice047 +case0027_slice048 +case0027_slice049 +case0027_slice050 +case0027_slice051 +case0027_slice052 +case0027_slice053 +case0027_slice054 +case0027_slice055 +case0027_slice056 +case0027_slice057 +case0027_slice058 +case0027_slice059 +case0027_slice060 +case0027_slice061 +case0027_slice062 +case0027_slice063 +case0027_slice064 +case0027_slice065 +case0027_slice066 +case0027_slice067 +case0027_slice068 +case0027_slice069 +case0027_slice070 +case0027_slice071 +case0027_slice072 +case0027_slice073 +case0027_slice074 +case0027_slice075 +case0027_slice076 +case0027_slice077 +case0027_slice078 +case0027_slice079 +case0027_slice080 +case0027_slice081 +case0027_slice082 +case0027_slice083 +case0027_slice084 +case0027_slice085 +case0027_slice086 +case0027_slice087 +case0028_slice000 +case0028_slice001 +case0028_slice002 +case0028_slice003 +case0028_slice004 +case0028_slice005 +case0028_slice006 +case0028_slice007 +case0028_slice008 +case0028_slice009 +case0028_slice010 +case0028_slice011 +case0028_slice012 +case0028_slice013 +case0028_slice014 +case0028_slice015 +case0028_slice016 +case0028_slice017 +case0028_slice018 +case0028_slice019 +case0028_slice020 +case0028_slice021 +case0028_slice022 +case0028_slice023 
+case0028_slice024 +case0028_slice025 +case0028_slice026 +case0028_slice027 +case0028_slice028 +case0028_slice029 +case0028_slice030 +case0028_slice031 +case0028_slice032 +case0028_slice033 +case0028_slice034 +case0028_slice035 +case0028_slice036 +case0028_slice037 +case0028_slice038 +case0028_slice039 +case0028_slice040 +case0028_slice041 +case0028_slice042 +case0028_slice043 +case0028_slice044 +case0028_slice045 +case0028_slice046 +case0028_slice047 +case0028_slice048 +case0028_slice049 +case0028_slice050 +case0028_slice051 +case0028_slice052 +case0028_slice053 +case0028_slice054 +case0028_slice055 +case0028_slice056 +case0028_slice057 +case0028_slice058 +case0028_slice059 +case0028_slice060 +case0028_slice061 +case0028_slice062 +case0028_slice063 +case0028_slice064 +case0028_slice065 +case0028_slice066 +case0028_slice067 +case0028_slice068 +case0028_slice069 +case0028_slice070 +case0028_slice071 +case0028_slice072 +case0028_slice073 +case0028_slice074 +case0028_slice075 +case0028_slice076 +case0028_slice077 +case0028_slice078 +case0028_slice079 +case0028_slice080 +case0028_slice081 +case0028_slice082 +case0028_slice083 +case0028_slice084 +case0028_slice085 +case0028_slice086 +case0028_slice087 +case0028_slice088 +case0037_slice000 +case0037_slice001 +case0037_slice002 +case0037_slice003 +case0037_slice004 +case0037_slice005 +case0037_slice006 +case0037_slice007 +case0037_slice008 +case0037_slice009 +case0037_slice010 +case0037_slice011 +case0037_slice012 +case0037_slice013 +case0037_slice014 +case0037_slice015 +case0037_slice016 +case0037_slice017 +case0037_slice018 +case0037_slice019 +case0037_slice020 +case0037_slice021 +case0037_slice022 +case0037_slice023 +case0037_slice024 +case0037_slice025 +case0037_slice026 +case0037_slice027 +case0037_slice028 +case0037_slice029 +case0037_slice030 +case0037_slice031 +case0037_slice032 +case0037_slice033 +case0037_slice034 +case0037_slice035 +case0037_slice036 +case0037_slice037 +case0037_slice038 +case0037_slice039 
+case0037_slice040 +case0037_slice041 +case0037_slice042 +case0037_slice043 +case0037_slice044 +case0037_slice045 +case0037_slice046 +case0037_slice047 +case0037_slice048 +case0037_slice049 +case0037_slice050 +case0037_slice051 +case0037_slice052 +case0037_slice053 +case0037_slice054 +case0037_slice055 +case0037_slice056 +case0037_slice057 +case0037_slice058 +case0037_slice059 +case0037_slice060 +case0037_slice061 +case0037_slice062 +case0037_slice063 +case0037_slice064 +case0037_slice065 +case0037_slice066 +case0037_slice067 +case0037_slice068 +case0037_slice069 +case0037_slice070 +case0037_slice071 +case0037_slice072 +case0037_slice073 +case0037_slice074 +case0037_slice075 +case0037_slice076 +case0037_slice077 +case0037_slice078 +case0037_slice079 +case0037_slice080 +case0037_slice081 +case0037_slice082 +case0037_slice083 +case0037_slice084 +case0037_slice085 +case0037_slice086 +case0037_slice087 +case0037_slice088 +case0037_slice089 +case0037_slice090 +case0037_slice091 +case0037_slice092 +case0037_slice093 +case0037_slice094 +case0037_slice095 +case0037_slice096 +case0037_slice097 +case0037_slice098 diff --git a/PuzzleTuning/SSL_structures/Swin_Unet_main/networks/swin_transformer_unet_skip_expand_decoder_sys.py b/PuzzleTuning/SSL_structures/Swin_Unet_main/networks/swin_transformer_unet_skip_expand_decoder_sys.py new file mode 100644 index 0000000000000000000000000000000000000000..dec2aeff5ea846ba112c70ebc8228171384f3c40 --- /dev/null +++ b/PuzzleTuning/SSL_structures/Swin_Unet_main/networks/swin_transformer_unet_skip_expand_decoder_sys.py @@ -0,0 +1,753 @@ +import torch +import torch.nn as nn +import torch.utils.checkpoint as checkpoint +from einops import rearrange +from timm.models.layers import DropPath, to_2tuple, trunc_normal_ + + +class Mlp(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features 
def window_partition(x, window_size):
    """Cut a feature map into non-overlapping square windows.

    Args:
        x: (B, H, W, C)
        window_size (int): side length of one window

    Returns:
        windows: (num_windows*B, window_size, window_size, C), ordered by
        batch element, then window row, then window column.
    """
    batch, height, width, channels = x.shape
    rows = height // window_size
    cols = width // window_size
    # Expose the window grid: (B, rows, ws, cols, ws, C).
    grid = x.view(batch, rows, window_size, cols, window_size, channels)
    # Put the two grid axes next to each other so they can be folded into the batch axis.
    grid = grid.permute(0, 1, 3, 2, 4, 5).contiguous()
    return grid.view(-1, window_size, window_size, channels)


def window_reverse(windows, window_size, H, W):
    """Stitch windows produced by ``window_partition`` back into feature maps.

    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): side length of one window
        H (int): height of the reassembled map
        W (int): width of the reassembled map

    Returns:
        x: (B, H, W, C)
    """
    # The batch size is recovered from the window count: each image contributes H*W / ws^2 windows.
    windows_per_image = H * W / window_size / window_size
    batch = int(windows.shape[0] / windows_per_image)
    grid = windows.view(batch, H // window_size, W // window_size, window_size, window_size, -1)
    # Interleave grid and in-window axes again: (B, rows, ws, cols, ws, C).
    merged = grid.permute(0, 1, 3, 2, 4, 5).contiguous()
    return merged.view(batch, H, W, -1)
Default: 0.0 + """ + + def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.): + + super().__init__() + self.dim = dim + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim ** -0.5 + + # define a parameter table of relative position bias + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(self.window_size[0]) + coords_w = torch.arange(self.window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer("relative_position_index", relative_position_index) + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table, std=.02) + self.softmax = nn.Softmax(dim=-1) + + def forward(self, x, mask=None): + """ + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + 
class SwinTransformerBlock(nn.Module):
    r""" Swin Transformer Block.

    Applies (shifted-)window attention followed by an MLP, each wrapped in a
    pre-norm residual branch with optional stochastic depth.

    Args:
        dim (int): Number of input channels.
        input_resolution (tuple[int]): Input resolution.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        shift_size (int): Shift size for SW-MSA.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
    """

    def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0,
                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
                 act_layer=nn.GELU, norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        self.input_resolution = input_resolution
        self.num_heads = num_heads
        self.window_size = window_size
        self.shift_size = shift_size
        self.mlp_ratio = mlp_ratio
        if min(self.input_resolution) <= self.window_size:
            # if window size is larger than input resolution, we don't partition windows
            self.shift_size = 0
            self.window_size = min(self.input_resolution)
        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"

        self.norm1 = norm_layer(dim)
        self.attn = WindowAttention(
            dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
            qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)

        # Stochastic depth is only instantiated when it can actually drop something.
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)

        if self.shift_size > 0:
            # calculate attention mask for SW-MSA
            H, W = self.input_resolution
            img_mask = torch.zeros((1, H, W, 1))  # 1 H W 1
            # Three bands per axis (body, window minus shift, last `shift_size` rows/cols)
            # give 3 x 3 regions; each region receives its own id in `cnt`.
            h_slices = (slice(0, -self.window_size),
                        slice(-self.window_size, -self.shift_size),
                        slice(-self.shift_size, None))
            w_slices = (slice(0, -self.window_size),
                        slice(-self.window_size, -self.shift_size),
                        slice(-self.shift_size, None))
            cnt = 0
            for h in h_slices:
                for w in w_slices:
                    img_mask[:, h, w, :] = cnt
                    cnt += 1

            mask_windows = window_partition(img_mask, self.window_size)  # nW, window_size, window_size, 1
            mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
            # Pairwise difference of region ids: non-zero means the two tokens of a window
            # belong to different regions, so that pair gets a -100 additive bias.
            attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
            attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
        else:
            attn_mask = None

        # Registered as a buffer (possibly None) so it follows the module across devices.
        self.register_buffer("attn_mask", attn_mask)

    def forward(self, x):
        """
        x: B, H*W, C  (H, W taken from ``input_resolution``); the output has the same shape.
        """
        H, W = self.input_resolution
        B, L, C = x.shape
        assert L == H * W, "input feature has wrong size"

        shortcut = x
        x = self.norm1(x)
        x = x.view(B, H, W, C)

        # cyclic shift
        if self.shift_size > 0:
            shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
        else:
            shifted_x = x

        # partition windows
        x_windows = window_partition(shifted_x, self.window_size)  # nW*B, window_size, window_size, C
        x_windows = x_windows.view(-1, self.window_size * self.window_size, C)  # nW*B, window_size*window_size, C

        # W-MSA/SW-MSA
        attn_windows = self.attn(x_windows, mask=self.attn_mask)  # nW*B, window_size*window_size, C

        # merge windows
        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
        shifted_x = window_reverse(attn_windows, self.window_size, H, W)  # B H' W' C

        # reverse cyclic shift
        if self.shift_size > 0:
            x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
        else:
            x = shifted_x
        x = x.view(B, H * W, C)

        # FFN
        # First residual closes the attention branch, second one wraps the MLP branch.
        x = shortcut + self.drop_path(x)
        x = x + self.drop_path(self.mlp(self.norm2(x)))

        return x

    def extra_repr(self) -> str:
        return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \
               f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}"

    def flops(self):
        """Return the FLOP estimate of one forward pass (norms, window attention and MLP)."""
        flops = 0
        H, W = self.input_resolution
        # norm1
        flops += self.dim * H * W
        # W-MSA/SW-MSA
        # nW is the number of windows; true division makes the result a float.
        nW = H * W / self.window_size / self.window_size
        flops += nW * self.attn.flops(self.window_size * self.window_size)
        # mlp
        flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio
        # norm2
        flops += self.dim * H * W
        return flops
class FinalPatchExpand_X4(nn.Module):
    """Final upsampling layer: turn every token into a dim_scale x dim_scale patch of tokens.

    Each input token is linearly expanded to ``dim_scale**2 * dim`` channels, and the
    channel groups are then laid out spatially, so the token grid grows by
    ``dim_scale`` along both axes while the channel width stays ``dim``.

    Args:
        input_resolution (tuple[int]): (H, W) token grid of the input.
        dim (int): Number of input (and output) channels.
        dim_scale (int): Upsampling factor per spatial axis. Default: 4
        norm_layer (nn.Module, optional): Normalization layer applied to the output. Default: nn.LayerNorm
    """

    def __init__(self, input_resolution, dim, dim_scale=4, norm_layer=nn.LayerNorm):
        super().__init__()
        self.input_resolution = input_resolution
        self.dim = dim
        self.dim_scale = dim_scale
        # One group of `dim` channels per output position inside the patch. This used to be a
        # hard-coded 16*dim, which only matches dim_scale == 4 (same weight shape for the default).
        self.expand = nn.Linear(dim, dim_scale ** 2 * dim, bias=False)
        self.output_dim = dim
        self.norm = norm_layer(self.output_dim)

    def forward(self, x):
        """
        x: B, H*W, C
        returns: B, (H*dim_scale)*(W*dim_scale), C
        """
        H, W = self.input_resolution
        x = self.expand(x)
        B, L, C = x.shape
        assert L == H * W, "input feature has wrong size"

        p = self.dim_scale
        # Same layout as the einops pattern 'b h w (p1 p2 c) -> b (h p1) (w p2) c':
        # split the channels into (p1, p2, c), then interleave p1 with h and p2 with w.
        x = x.view(B, H, W, p, p, C // (p * p))
        x = x.permute(0, 1, 3, 2, 4, 5).reshape(B, H * p * W * p, self.output_dim)
        x = self.norm(x)

        return x
class BasicLayer_up(nn.Module):
    """ A basic Swin Transformer layer for one decoder stage.

    Runs ``depth`` Swin Transformer blocks and then, if requested, an upsampling layer.

    Args:
        dim (int): Number of input channels.
        input_resolution (tuple[int]): Input resolution.
        depth (int): Number of blocks.
        num_heads (int): Number of attention heads.
        window_size (int): Local window size.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float | list[float] | tuple[float], optional): Stochastic depth rate, either one value
            for all blocks or one value per block. Default: 0.0
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
        upsample (nn.Module | None, optional): Upsample layer class applied at the end of the layer; it is
            built as ``upsample(input_resolution, dim=dim, dim_scale=2, norm_layer=norm_layer)``. Default: None
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
    """

    def __init__(self, dim, input_resolution, depth, num_heads, window_size,
                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., norm_layer=nn.LayerNorm, upsample=None, use_checkpoint=False):

        super().__init__()
        self.dim = dim
        self.input_resolution = input_resolution
        self.depth = depth
        self.use_checkpoint = use_checkpoint

        # build blocks
        # Even-indexed blocks use regular windows, odd-indexed ones use windows shifted by half a window.
        self.blocks = nn.ModuleList([
            SwinTransformerBlock(dim=dim, input_resolution=input_resolution,
                                 num_heads=num_heads, window_size=window_size,
                                 shift_size=0 if (i % 2 == 0) else window_size // 2,
                                 mlp_ratio=mlp_ratio,
                                 qkv_bias=qkv_bias, qk_scale=qk_scale,
                                 drop=drop, attn_drop=attn_drop,
                                 # a per-block schedule may arrive as a list or a tuple
                                 drop_path=drop_path[i] if isinstance(drop_path, (list, tuple)) else drop_path,
                                 norm_layer=norm_layer)
            for i in range(depth)])

        # patch expanding layer
        if upsample is not None:
            # Build whatever layer class the caller supplied (mirrors how BasicLayer treats
            # `downsample`) instead of silently substituting a fixed class.
            self.upsample = upsample(input_resolution, dim=dim, dim_scale=2, norm_layer=norm_layer)
        else:
            self.upsample = None

    def forward(self, x):
        """Apply every block in order (optionally checkpointed), then the upsample layer if present."""
        for blk in self.blocks:
            if self.use_checkpoint:
                x = checkpoint.checkpoint(blk, x)
            else:
                x = blk(x)
        if self.upsample is not None:
            x = self.upsample(x)
        return x
+ embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Module, optional): Normalization layer. Default: None + """ + + def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]] + self.img_size = img_size + self.patch_size = patch_size + self.patches_resolution = patches_resolution + self.num_patches = patches_resolution[0] * patches_resolution[1] + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + B, C, H, W = x.shape + # FIXME look at relaxing size constraints + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + x = self.proj(x).flatten(2).transpose(1, 2) # B Ph*Pw C + if self.norm is not None: + x = self.norm(x) + return x + + def flops(self): + Ho, Wo = self.patches_resolution + flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1]) + if self.norm is not None: + flops += Ho * Wo * self.embed_dim + return flops + + +class SwinTransformerSys(nn.Module): + r""" Swin Transformer + A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - + https://arxiv.org/pdf/2103.14030 + + Args: + img_size (int | tuple(int)): Input image size. Default 224 + patch_size (int | tuple(int)): Patch size. Default: 4 + in_chans (int): Number of input image channels. Default: 3 + num_classes (int): Number of classes for classification head. Default: 1000 + embed_dim (int): Patch embedding dimension. 
Default: 96 + depths (tuple(int)): Depth of each Swin Transformer layer. + num_heads (tuple(int)): Number of attention heads in different layers. + window_size (int): Window size. Default: 7 + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4 + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None + drop_rate (float): Dropout rate. Default: 0 + attn_drop_rate (float): Attention dropout rate. Default: 0 + drop_path_rate (float): Stochastic depth rate. Default: 0.1 + norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. + ape (bool): If True, add absolute position embedding to the patch embedding. Default: False + patch_norm (bool): If True, add normalization after patch embedding. Default: True + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False + """ + + def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes=2, + embed_dim=96, depths=[2, 2, 2, 2], depths_decoder=[1, 2, 2, 2], num_heads=[3, 6, 12, 24], + window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None, + drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1, + norm_layer=nn.LayerNorm, ape=False, patch_norm=True, + use_checkpoint=False, final_upsample="expand_first", **kwargs): + super().__init__() + + print("SwinTransformerSys expand initial----depths:{};depths_decoder:{};drop_path_rate:{};num_classes:{}".format(depths, + depths_decoder,drop_path_rate,num_classes)) + + self.num_classes = num_classes + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.ape = ape + self.patch_norm = patch_norm + self.num_features = int(embed_dim * 2 ** (self.num_layers - 1)) + self.num_features_up = int(embed_dim * 2) + self.mlp_ratio = mlp_ratio + self.final_upsample = final_upsample + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed( + img_size=img_size, patch_size=patch_size, 
in_chans=in_chans, embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None) + num_patches = self.patch_embed.num_patches + patches_resolution = self.patch_embed.patches_resolution + self.patches_resolution = patches_resolution + + # absolute position embedding + if self.ape: + self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim)) + trunc_normal_(self.absolute_pos_embed, std=.02) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule + + # build encoder and bottleneck layers + self.layers = nn.ModuleList() + for i_layer in range(self.num_layers): + layer = BasicLayer(dim=int(embed_dim * 2 ** i_layer), + input_resolution=(patches_resolution[0] // (2 ** i_layer), + patches_resolution[1] // (2 ** i_layer)), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=self.mlp_ratio, + qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, + use_checkpoint=use_checkpoint) + self.layers.append(layer) + + # build decoder layers + self.layers_up = nn.ModuleList() + self.concat_back_dim = nn.ModuleList() + for i_layer in range(self.num_layers): + concat_linear = nn.Linear(2*int(embed_dim*2**(self.num_layers-1-i_layer)), + int(embed_dim*2**(self.num_layers-1-i_layer))) if i_layer > 0 else nn.Identity() + if i_layer ==0 : + layer_up = PatchExpand(input_resolution=(patches_resolution[0] // (2 ** (self.num_layers-1-i_layer)), + patches_resolution[1] // (2 ** (self.num_layers-1-i_layer))), dim=int(embed_dim * 2 ** (self.num_layers-1-i_layer)), dim_scale=2, norm_layer=norm_layer) + else: + layer_up = BasicLayer_up(dim=int(embed_dim * 2 ** (self.num_layers-1-i_layer)), + 
input_resolution=(patches_resolution[0] // (2 ** (self.num_layers-1-i_layer)), + patches_resolution[1] // (2 ** (self.num_layers-1-i_layer))), + depth=depths[(self.num_layers-1-i_layer)], + num_heads=num_heads[(self.num_layers-1-i_layer)], + window_size=window_size, + mlp_ratio=self.mlp_ratio, + qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:(self.num_layers-1-i_layer)]):sum(depths[:(self.num_layers-1-i_layer) + 1])], + norm_layer=norm_layer, + upsample=PatchExpand if (i_layer < self.num_layers - 1) else None, + use_checkpoint=use_checkpoint) + self.layers_up.append(layer_up) + self.concat_back_dim.append(concat_linear) + + self.norm = norm_layer(self.num_features) + self.norm_up= norm_layer(self.embed_dim) + + if self.final_upsample == "expand_first": + print("---final upsample expand_first---") + self.up = FinalPatchExpand_X4(input_resolution=(img_size//patch_size,img_size//patch_size),dim_scale=4,dim=embed_dim) + self.output = nn.Conv2d(in_channels=embed_dim,out_channels=self.num_classes,kernel_size=1,bias=False) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + return {'absolute_pos_embed'} + + @torch.jit.ignore + def no_weight_decay_keywords(self): + return {'relative_position_bias_table'} + + #Encoder and Bottleneck + def forward_features(self, x): + x = self.patch_embed(x) + if self.ape: + x = x + self.absolute_pos_embed + x = self.pos_drop(x) + x_downsample = [] + + for layer in self.layers: + x_downsample.append(x) + x = layer(x) + + x = self.norm(x) # B L C + + return x, x_downsample + + #Dencoder and Skip connection + def forward_up_features(self, x, x_downsample): + for inx, 
layer_up in enumerate(self.layers_up): + if inx == 0: + x = layer_up(x) + else: + x = torch.cat([x,x_downsample[3-inx]],-1) + x = self.concat_back_dim[inx](x) + x = layer_up(x) + + x = self.norm_up(x) # B L C + + return x + + def up_x4(self, x): + H, W = self.patches_resolution + # print(H, W) + B, L, C = x.shape + # print(B, L, C) + assert L == H*W, "input features has wrong size" + + if self.final_upsample=="expand_first": + x = self.up(x) + x = x.view(B,4*H,4*W,-1) + x = x.permute(0,3,1,2) #B,C,H,W + x = self.output(x) + + return x + + def forward(self, x): + x, x_downsample = self.forward_features(x) + x = self.forward_up_features(x,x_downsample) # B, L, C + x = self.up_x4(x) + + return x + + def flops(self): + flops = 0 + flops += self.patch_embed.flops() + for i, layer in enumerate(self.layers): + flops += layer.flops() + flops += self.num_features * self.patches_resolution[0] * self.patches_resolution[1] // (2 ** self.num_layers) + flops += self.num_features * self.num_classes + return flops diff --git a/PuzzleTuning/SSL_structures/Swin_Unet_main/networks/vision_transformer.py b/PuzzleTuning/SSL_structures/Swin_Unet_main/networks/vision_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..889f8c91aaf509f10b5ead4c310e3e2a7a0d11ae --- /dev/null +++ b/PuzzleTuning/SSL_structures/Swin_Unet_main/networks/vision_transformer.py @@ -0,0 +1,89 @@ +# coding=utf-8 +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy +import logging +import math + +from os.path import join as pjoin + +import torch +import torch.nn as nn +import numpy as np + +from torch.nn import CrossEntropyLoss, Dropout, Softmax, Linear, Conv2d, LayerNorm +from torch.nn.modules.utils import _pair +from scipy import ndimage +from .swin_transformer_unet_skip_expand_decoder_sys import SwinTransformerSys + +logger = logging.getLogger(__name__) + +class SwinUnet(nn.Module): + def __init__(self, 
img_size=224, num_classes=2, zero_head=False, vis=False, patch_size=16): + super(SwinUnet, self).__init__() + self.num_classes = num_classes + self.zero_head = zero_head + + + self.swin_unet = SwinTransformerSys(img_size=img_size, + patch_size=4, + in_chans=3, + num_classes=num_classes, + embed_dim=96, + depths=[2, 2, 2, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0, + drop_path_rate=0.1, + ape=False, + patch_norm=True, + use_checkpoint=False) + + def forward(self, x): + if x.size()[1] == 1: + x = x.repeat(1,3,1,1) + logits = self.swin_unet(x) + return logits + + # def load_from(self, config): + # pretrained_path = config.MODEL.PRETRAIN_CKPT + # if pretrained_path is not None: + # print("pretrained_path:{}".format(pretrained_path)) + # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + # pretrained_dict = torch.load(pretrained_path, map_location=device) + # if "model" not in pretrained_dict: + # print("---start load pretrained modle by splitting---") + # pretrained_dict = {k[17:]:v for k,v in pretrained_dict.items()} + # for k in list(pretrained_dict.keys()): + # if "output" in k: + # print("delete key:{}".format(k)) + # del pretrained_dict[k] + # msg = self.swin_unet.load_state_dict(pretrained_dict,strict=False) + # # print(msg) + # return + # pretrained_dict = pretrained_dict['model'] + # print("---start load pretrained modle of swin encoder---") + # + # model_dict = self.swin_unet.state_dict() + # full_dict = copy.deepcopy(pretrained_dict) + # for k, v in pretrained_dict.items(): + # if "layers." in k: + # current_layer_num = 3-int(k[7:8]) + # current_k = "layers_up." 
+ str(current_layer_num) + k[8:] + # full_dict.update({current_k:v}) + # for k in list(full_dict.keys()): + # if k in model_dict: + # if full_dict[k].shape != model_dict[k].shape: + # print("delete:{};shape pretrain:{};shape model:{}".format(k,v.shape,model_dict[k].shape)) + # del full_dict[k] + # + # msg = self.swin_unet.load_state_dict(full_dict, strict=False) + # # print(msg) + # else: + # print("none pretrain") + # \ No newline at end of file diff --git a/PuzzleTuning/SSL_structures/Swin_Unet_main/requirements.txt b/PuzzleTuning/SSL_structures/Swin_Unet_main/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4abfe422e0bd10ed594596292121fb6eac4d4581 --- /dev/null +++ b/PuzzleTuning/SSL_structures/Swin_Unet_main/requirements.txt @@ -0,0 +1,11 @@ +torch==1.4.0 +torchvision==0.5.0 +numpy +tqdm +tensorboard +tensorboardX +ml-collections +medpy +SimpleITK +scipy +h5py diff --git a/PuzzleTuning/SSL_structures/Swin_Unet_main/test.py b/PuzzleTuning/SSL_structures/Swin_Unet_main/test.py new file mode 100644 index 0000000000000000000000000000000000000000..7c0535bf9d729a3fb9355d4cd4fcac18c99c31bc --- /dev/null +++ b/PuzzleTuning/SSL_structures/Swin_Unet_main/test.py @@ -0,0 +1,141 @@ +import argparse +import logging +import os +import random +import sys +import numpy as np +import torch +import torch.backends.cudnn as cudnn +import torch.nn as nn +from torch.utils.data import DataLoader +from tqdm import tqdm +from datasets.dataset_synapse import Synapse_dataset +from utils import test_single_volume +from networks.vision_transformer import SwinUnet as ViT_seg +from trainer import trainer_synapse +from config import get_config + +parser = argparse.ArgumentParser() +parser.add_argument('--volume_path', type=str, + default='../data/Synapse/test_vol_h5', help='root dir for validation volume data') # for acdc volume_path=root_dir +parser.add_argument('--dataset', type=str, + default='Synapse', help='experiment_name') 
+parser.add_argument('--num_classes', type=int, + default=9, help='output channel of network') +parser.add_argument('--list_dir', type=str, + default='./lists/lists_Synapse', help='list dir') +parser.add_argument('--output_dir', type=str, help='output dir') +parser.add_argument('--max_iterations', type=int,default=30000, help='maximum epoch number to train') +parser.add_argument('--max_epochs', type=int, default=150, help='maximum epoch number to train') +parser.add_argument('--batch_size', type=int, default=24, + help='batch_size per gpu') +parser.add_argument('--img_size', type=int, default=224, help='input patch size of network input') +parser.add_argument('--is_savenii', action="store_true", help='whether to save results during inference') +parser.add_argument('--test_save_dir', type=str, default='../predictions', help='saving prediction as nii!') +parser.add_argument('--deterministic', type=int, default=1, help='whether use deterministic training') +parser.add_argument('--base_lr', type=float, default=0.01, help='segmentation network learning rate') +parser.add_argument('--seed', type=int, default=1234, help='random seed') +parser.add_argument('--cfg', type=str, required=True, metavar="FILE", help='path to config file', ) +parser.add_argument( + "--opts", + help="Modify config options by adding 'KEY VALUE' pairs. 
", + default=None, + nargs='+', + ) +parser.add_argument('--zip', action='store_true', help='use zipped dataset instead of folder dataset') +parser.add_argument('--cache-mode', type=str, default='part', choices=['no', 'full', 'part'], + help='no: no cache, ' + 'full: cache all data, ' + 'part: sharding the dataset into nonoverlapping pieces and only cache one piece') +parser.add_argument('--resume', help='resume from checkpoint') +parser.add_argument('--accumulation-steps', type=int, help="gradient accumulation steps") +parser.add_argument('--use-checkpoint', action='store_true', + help="whether to use gradient checkpointing to save memory") +parser.add_argument('--amp-opt-level', type=str, default='O1', choices=['O0', 'O1', 'O2'], + help='mixed precision opt level, if O0, no amp is used') +parser.add_argument('--tag', help='tag of experiment') +parser.add_argument('--eval', action='store_true', help='Perform evaluation only') +parser.add_argument('--throughput', action='store_true', help='Test throughput only') + +args = parser.parse_args() +if args.dataset == "Synapse": + args.volume_path = os.path.join(args.volume_path, "test_vol_h5") +config = get_config(args) + + +def inference(args, model, test_save_path=None): + db_test = args.Dataset(base_dir=args.volume_path, split="test_vol", list_dir=args.list_dir) + testloader = DataLoader(db_test, batch_size=1, shuffle=False, num_workers=1) + logging.info("{} test iterations per epoch".format(len(testloader))) + model.eval() + metric_list = 0.0 + for i_batch, sampled_batch in tqdm(enumerate(testloader)): + h, w = sampled_batch["image"].size()[2:] + image, label, case_name = sampled_batch["image"], sampled_batch["label"], sampled_batch['case_name'][0] + metric_i = test_single_volume(image, label, model, classes=args.num_classes, patch_size=[args.img_size, args.img_size], + test_save_path=test_save_path, case=case_name, z_spacing=args.z_spacing) + metric_list += np.array(metric_i) + logging.info('idx %d case %s mean_dice 
%f mean_hd95 %f' % (i_batch, case_name, np.mean(metric_i, axis=0)[0], np.mean(metric_i, axis=0)[1])) + metric_list = metric_list / len(db_test) + for i in range(1, args.num_classes): + logging.info('Mean class %d mean_dice %f mean_hd95 %f' % (i, metric_list[i-1][0], metric_list[i-1][1])) + performance = np.mean(metric_list, axis=0)[0] + mean_hd95 = np.mean(metric_list, axis=0)[1] + logging.info('Testing performance in best val model: mean_dice : %f mean_hd95 : %f' % (performance, mean_hd95)) + return "Testing Finished!" + + +if __name__ == "__main__": + + if not args.deterministic: + cudnn.benchmark = True + cudnn.deterministic = False + else: + cudnn.benchmark = False + cudnn.deterministic = True + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.cuda.manual_seed(args.seed) + + dataset_config = { + 'Synapse': { + 'Dataset': Synapse_dataset, + 'volume_path': args.volume_path, + 'list_dir': './lists/lists_Synapse', + 'num_classes': 9, + 'z_spacing': 1, + }, + } + dataset_name = args.dataset + args.num_classes = dataset_config[dataset_name]['num_classes'] + args.volume_path = dataset_config[dataset_name]['volume_path'] + args.Dataset = dataset_config[dataset_name]['Dataset'] + args.list_dir = dataset_config[dataset_name]['list_dir'] + args.z_spacing = dataset_config[dataset_name]['z_spacing'] + args.is_pretrain = True + + net = ViT_seg(config, img_size=args.img_size, num_classes=args.num_classes).cuda() + + snapshot = os.path.join(args.output_dir, 'best_model.pth') + if not os.path.exists(snapshot): snapshot = snapshot.replace('best_model', 'epoch_'+str(args.max_epochs-1)) + msg = net.load_state_dict(torch.load(snapshot)) + print("self trained swin unet",msg) + snapshot_name = snapshot.split('/')[-1] + + log_folder = './test_log/test_log_' + os.makedirs(log_folder, exist_ok=True) + logging.basicConfig(filename=log_folder + '/'+snapshot_name+".txt", level=logging.INFO, format='[%(asctime)s.%(msecs)03d] %(message)s', 
datefmt='%H:%M:%S') + logging.getLogger().addHandler(logging.StreamHandler(sys.stdout)) + logging.info(str(args)) + logging.info(snapshot_name) + + if args.is_savenii: + args.test_save_dir = os.path.join(args.output_dir, "predictions") + test_save_path = args.test_save_dir + os.makedirs(test_save_path, exist_ok=True) + else: + test_save_path = None + inference(args, net, test_save_path) + + diff --git a/PuzzleTuning/SSL_structures/Swin_Unet_main/test.sh b/PuzzleTuning/SSL_structures/Swin_Unet_main/test.sh new file mode 100644 index 0000000000000000000000000000000000000000..2370296d36852a5707555ee9bfd648e11fcd777b --- /dev/null +++ b/PuzzleTuning/SSL_structures/Swin_Unet_main/test.sh @@ -0,0 +1,52 @@ +#!/bin/bash +if [$epoch_time] +then + EPOCH_TIME = $epoch_time +else + EPOCH_TIME = 150 +fi + +if [$out_dir] +then + OUT_DIR = $out_dir +else + OUT_DIR = './model_out' +fi + +if [$cfg] +then + CFG = $cfg +else + CFG = 'configs/swin_tiny_patch4_window7_224_lite.yaml' +fi + +if [$data_dir] +then + DATA_DIR = $data_dir +else + DATA_DIR = 'datasets/Synapse' +fi + +if [$learning_rate] +then + LEARNING_RATE = $learning_rate +else + LEARNING_RATE = 0.05 +fi + +if [$img_size] +then + IMG_SIZE = $img_size +else + IMG_SIZE = 224 +fi + +if [$batch_size] +then + BATCH_SIZE = $batch_size +else + BATCH_SIZE = 24 +fi + +echo "start test model" +pyhton test.py --dataset Synapse --cfg $CFG --is_saveni --volume_path $DATA_DIR --max_epochs $EPOCH_TIME --output_dir $OUT_DIR --img_size $IMG_SIZE --base_lr $LEARNING_RATE --batch_size $BATCH_SIZE \ No newline at end of file diff --git a/PuzzleTuning/SSL_structures/Swin_Unet_main/train.py b/PuzzleTuning/SSL_structures/Swin_Unet_main/train.py new file mode 100644 index 0000000000000000000000000000000000000000..56415d06b9bfe25ae4341c37615d4a51d18dc26d --- /dev/null +++ b/PuzzleTuning/SSL_structures/Swin_Unet_main/train.py @@ -0,0 +1,99 @@ +import argparse +import logging +import os +import random +import numpy as np +import torch +import 
torch.backends.cudnn as cudnn +from networks.vision_transformer import SwinUnet as ViT_seg +from trainer import trainer_synapse +from config import get_config + +parser = argparse.ArgumentParser() +parser.add_argument('--root_path', type=str, + default='../data/Synapse/train_npz', help='root dir for data') +parser.add_argument('--dataset', type=str, + default='Synapse', help='experiment_name') +parser.add_argument('--list_dir', type=str, + default='./lists/lists_Synapse', help='list dir') +parser.add_argument('--num_classes', type=int, + default=9, help='output channel of network') +parser.add_argument('--output_dir', type=str, help='output dir') +parser.add_argument('--max_iterations', type=int, + default=30000, help='maximum epoch number to train') +parser.add_argument('--max_epochs', type=int, + default=150, help='maximum epoch number to train') +parser.add_argument('--batch_size', type=int, + default=24, help='batch_size per gpu') +parser.add_argument('--n_gpu', type=int, default=1, help='total gpu') +parser.add_argument('--deterministic', type=int, default=1, + help='whether use deterministic training') +parser.add_argument('--base_lr', type=float, default=0.01, + help='segmentation network learning rate') +parser.add_argument('--img_size', type=int, + default=224, help='input patch size of network input') +parser.add_argument('--seed', type=int, + default=1234, help='random seed') +parser.add_argument('--cfg', type=str, required=True, metavar="FILE", help='path to config file', ) +parser.add_argument( + "--opts", + help="Modify config options by adding 'KEY VALUE' pairs. 
", + default=None, + nargs='+', + ) +parser.add_argument('--zip', action='store_true', help='use zipped dataset instead of folder dataset') +parser.add_argument('--cache-mode', type=str, default='part', choices=['no', 'full', 'part'], + help='no: no cache, ' + 'full: cache all data, ' + 'part: sharding the dataset into nonoverlapping pieces and only cache one piece') +parser.add_argument('--resume', help='resume from checkpoint') +parser.add_argument('--accumulation-steps', type=int, help="gradient accumulation steps") +parser.add_argument('--use-checkpoint', action='store_true', + help="whether to use gradient checkpointing to save memory") +parser.add_argument('--amp-opt-level', type=str, default='O1', choices=['O0', 'O1', 'O2'], + help='mixed precision opt level, if O0, no amp is used') +parser.add_argument('--tag', help='tag of experiment') +parser.add_argument('--eval', action='store_true', help='Perform evaluation only') +parser.add_argument('--throughput', action='store_true', help='Test throughput only') + +args = parser.parse_args() +if args.dataset == "Synapse": + args.root_path = os.path.join(args.root_path, "train_npz") +config = get_config(args) + + +if __name__ == "__main__": + if not args.deterministic: + cudnn.benchmark = True + cudnn.deterministic = False + else: + cudnn.benchmark = False + cudnn.deterministic = True + + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.cuda.manual_seed(args.seed) + + dataset_name = args.dataset + dataset_config = { + 'Synapse': { + 'root_path': args.root_path, + 'list_dir': './lists/lists_Synapse', + 'num_classes': 9, + }, + } + + if args.batch_size != 24 and args.batch_size % 6 == 0: + args.base_lr *= args.batch_size / 24 + args.num_classes = dataset_config[dataset_name]['num_classes'] + args.root_path = dataset_config[dataset_name]['root_path'] + args.list_dir = dataset_config[dataset_name]['list_dir'] + + if not os.path.exists(args.output_dir): + 
os.makedirs(args.output_dir) + net = ViT_seg(config, img_size=args.img_size, num_classes=args.num_classes).cuda() + net.load_from(config) + + trainer = {'Synapse': trainer_synapse,} + trainer[dataset_name](args, net, args.output_dir) \ No newline at end of file diff --git a/PuzzleTuning/SSL_structures/Swin_Unet_main/train.sh b/PuzzleTuning/SSL_structures/Swin_Unet_main/train.sh new file mode 100644 index 0000000000000000000000000000000000000000..f0f967beb4d3e2373bee9f025d25dc7f23894bf2 --- /dev/null +++ b/PuzzleTuning/SSL_structures/Swin_Unet_main/train.sh @@ -0,0 +1,52 @@ +#!/bin/bash +if [$epoch_time] +then + EPOCH_TIME = $epoch_time +else + EPOCH_TIME = 150 +fi + +if [$out_dir] +then + OUT_DIR = $out_dir +else + OUT_DIR = './model_out' +fi + +if [$cfg] +then + CFG = $cfg +else + CFG = 'configs/swin_tiny_patch4_window7_224_lite.yaml' +fi + +if [$data_dir] +then + DATA_DIR = $data_dir +else + DATA_DIR = 'datasets/Synapse' +fi + +if [$learning_rate] +then + LEARNING_RATE = $learning_rate +else + LEARNING_RATE = 0.05 +fi + +if [$img_size] +then + IMG_SIZE = $img_size +else + IMG_SIZE = 224 +fi + +if [$batch_size] +then + BATCH_SIZE = $batch_size +else + BATCH_SIZE = 24 +fi + +echo "start train model" +pyhton train.py --dataset Synapse --cfg $CFG --root_path $DATA_DIR --max_epochs $EPOCH_TIME --output_dir $OUT_DIR --img_size $IMG_SIZE --base_lr $LEARNING_RATE --batch_size $BATCH_SIZE \ No newline at end of file diff --git a/PuzzleTuning/SSL_structures/Swin_Unet_main/trainer.py b/PuzzleTuning/SSL_structures/Swin_Unet_main/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..658a6a2708c1ba856b4f40545da0deba47f879e6 --- /dev/null +++ b/PuzzleTuning/SSL_structures/Swin_Unet_main/trainer.py @@ -0,0 +1,97 @@ +import argparse +import logging +import os +import random +import sys +import time +import numpy as np +import torch +import torch.nn as nn +import torch.optim as optim +from tensorboardX import SummaryWriter +from torch.nn.modules.loss 
import CrossEntropyLoss +from torch.utils.data import DataLoader +from tqdm import tqdm +from utils import DiceLoss +from torchvision import transforms +from utils import test_single_volume + +def trainer_synapse(args, model, snapshot_path): + from datasets.dataset_synapse import Synapse_dataset, RandomGenerator + logging.basicConfig(filename=snapshot_path + "/log.txt", level=logging.INFO, + format='[%(asctime)s.%(msecs)03d] %(message)s', datefmt='%H:%M:%S') + logging.getLogger().addHandler(logging.StreamHandler(sys.stdout)) + logging.info(str(args)) + base_lr = args.base_lr + num_classes = args.num_classes + batch_size = args.batch_size * args.n_gpu + # max_iterations = args.max_iterations + db_train = Synapse_dataset(base_dir=args.root_path, list_dir=args.list_dir, split="train", + transform=transforms.Compose( + [RandomGenerator(output_size=[args.img_size, args.img_size])])) + print("The length of train set is: {}".format(len(db_train))) + + def worker_init_fn(worker_id): + random.seed(args.seed + worker_id) + + trainloader = DataLoader(db_train, batch_size=batch_size, shuffle=True, num_workers=8, pin_memory=True, + worker_init_fn=worker_init_fn) + if args.n_gpu > 1: + model = nn.DataParallel(model) + model.train() + ce_loss = CrossEntropyLoss() + dice_loss = DiceLoss(num_classes) + optimizer = optim.SGD(model.parameters(), lr=base_lr, momentum=0.9, weight_decay=0.0001) + writer = SummaryWriter(snapshot_path + '/log') + iter_num = 0 + max_epoch = args.max_epochs + max_iterations = args.max_epochs * len(trainloader) # max_epoch = max_iterations // len(trainloader) + 1 + logging.info("{} iterations per epoch. 
{} max iterations ".format(len(trainloader), max_iterations)) + best_performance = 0.0 + iterator = tqdm(range(max_epoch), ncols=70) + for epoch_num in iterator: + for i_batch, sampled_batch in enumerate(trainloader): + image_batch, label_batch = sampled_batch['image'], sampled_batch['label'] + image_batch, label_batch = image_batch.cuda(), label_batch.cuda() + outputs = model(image_batch) + loss_ce = ce_loss(outputs, label_batch[:].long()) + loss_dice = dice_loss(outputs, label_batch, softmax=True) + loss = 0.4 * loss_ce + 0.6 * loss_dice + optimizer.zero_grad() + loss.backward() + optimizer.step() + lr_ = base_lr * (1.0 - iter_num / max_iterations) ** 0.9 + for param_group in optimizer.param_groups: + param_group['lr'] = lr_ + + iter_num = iter_num + 1 + writer.add_scalar('info/lr', lr_, iter_num) + writer.add_scalar('info/total_loss', loss, iter_num) + writer.add_scalar('info/loss_ce', loss_ce, iter_num) + + logging.info('iteration %d : loss : %f, loss_ce: %f' % (iter_num, loss.item(), loss_ce.item())) + + if iter_num % 20 == 0: + image = image_batch[1, 0:1, :, :] + image = (image - image.min()) / (image.max() - image.min()) + writer.add_image('train/Image', image, iter_num) + outputs = torch.argmax(torch.softmax(outputs, dim=1), dim=1, keepdim=True) + writer.add_image('train/Prediction', outputs[1, ...] 
* 50, iter_num) + labs = label_batch[1, ...].unsqueeze(0) * 50 + writer.add_image('train/GroundTruth', labs, iter_num) + + save_interval = 50 # int(max_epoch/6) + if epoch_num > int(max_epoch / 2) and (epoch_num + 1) % save_interval == 0: + save_mode_path = os.path.join(snapshot_path, 'epoch_' + str(epoch_num) + '.pth') + torch.save(model.state_dict(), save_mode_path) + logging.info("save model to {}".format(save_mode_path)) + + if epoch_num >= max_epoch - 1: + save_mode_path = os.path.join(snapshot_path, 'epoch_' + str(epoch_num) + '.pth') + torch.save(model.state_dict(), save_mode_path) + logging.info("save model to {}".format(save_mode_path)) + iterator.close() + break + + writer.close() + return "Training Finished!" \ No newline at end of file diff --git a/PuzzleTuning/SSL_structures/Swin_Unet_main/utils.py b/PuzzleTuning/SSL_structures/Swin_Unet_main/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0e3a1bf9ad7058f506faff0a8ae4f6056514575c --- /dev/null +++ b/PuzzleTuning/SSL_structures/Swin_Unet_main/utils.py @@ -0,0 +1,102 @@ +import numpy as np +import torch +from medpy import metric +from scipy.ndimage import zoom +import torch.nn as nn +import SimpleITK as sitk + + +class DiceLoss(nn.Module): + def __init__(self, n_classes): + super(DiceLoss, self).__init__() + self.n_classes = n_classes + + def _one_hot_encoder(self, input_tensor): + tensor_list = [] + for i in range(self.n_classes): + temp_prob = input_tensor == i # * torch.ones_like(input_tensor) + tensor_list.append(temp_prob.unsqueeze(1)) + output_tensor = torch.cat(tensor_list, dim=1) + return output_tensor.float() + + def _dice_loss(self, score, target): + target = target.float() + smooth = 1e-5 + intersect = torch.sum(score * target) + y_sum = torch.sum(target * target) + z_sum = torch.sum(score * score) + loss = (2 * intersect + smooth) / (z_sum + y_sum + smooth) + loss = 1 - loss + return loss + + def forward(self, inputs, target, weight=None, softmax=False): + if 
softmax: + inputs = torch.softmax(inputs, dim=1) + target = self._one_hot_encoder(target) + if weight is None: + weight = [1] * self.n_classes + assert inputs.size() == target.size(), 'predict {} & target {} shape do not match'.format(inputs.size(), target.size()) + class_wise_dice = [] + loss = 0.0 + for i in range(0, self.n_classes): + dice = self._dice_loss(inputs[:, i], target[:, i]) + class_wise_dice.append(1.0 - dice.item()) + loss += dice * weight[i] + return loss / self.n_classes + + +def calculate_metric_percase(pred, gt): + pred[pred > 0] = 1 + gt[gt > 0] = 1 + if pred.sum() > 0 and gt.sum()>0: + dice = metric.binary.dc(pred, gt) + hd95 = metric.binary.hd95(pred, gt) + return dice, hd95 + elif pred.sum() > 0 and gt.sum()==0: + return 1, 0 + else: + return 0, 0 + + +def test_single_volume(image, label, net, classes, patch_size=[256, 256], test_save_path=None, case=None, z_spacing=1): + image, label = image.squeeze(0).cpu().detach().numpy(), label.squeeze(0).cpu().detach().numpy() + if len(image.shape) == 3: + prediction = np.zeros_like(label) + for ind in range(image.shape[0]): + slice = image[ind, :, :] + x, y = slice.shape[0], slice.shape[1] + if x != patch_size[0] or y != patch_size[1]: + slice = zoom(slice, (patch_size[0] / x, patch_size[1] / y), order=3) # previous using 0 + input = torch.from_numpy(slice).unsqueeze(0).unsqueeze(0).float().cuda() + net.eval() + with torch.no_grad(): + outputs = net(input) + out = torch.argmax(torch.softmax(outputs, dim=1), dim=1).squeeze(0) + out = out.cpu().detach().numpy() + if x != patch_size[0] or y != patch_size[1]: + pred = zoom(out, (x / patch_size[0], y / patch_size[1]), order=0) + else: + pred = out + prediction[ind] = pred + else: + input = torch.from_numpy(image).unsqueeze( + 0).unsqueeze(0).float().cuda() + net.eval() + with torch.no_grad(): + out = torch.argmax(torch.softmax(net(input), dim=1), dim=1).squeeze(0) + prediction = out.cpu().detach().numpy() + metric_list = [] + for i in range(1, classes): + 
metric_list.append(calculate_metric_percase(prediction == i, label == i)) + + if test_save_path is not None: + img_itk = sitk.GetImageFromArray(image.astype(np.float32)) + prd_itk = sitk.GetImageFromArray(prediction.astype(np.float32)) + lab_itk = sitk.GetImageFromArray(label.astype(np.float32)) + img_itk.SetSpacing((1, 1, z_spacing)) + prd_itk.SetSpacing((1, 1, z_spacing)) + lab_itk.SetSpacing((1, 1, z_spacing)) + sitk.WriteImage(prd_itk, test_save_path + '/'+case + "_pred.nii.gz") + sitk.WriteImage(img_itk, test_save_path + '/'+ case + "_img.nii.gz") + sitk.WriteImage(lab_itk, test_save_path + '/'+ case + "_gt.nii.gz") + return metric_list \ No newline at end of file diff --git a/PuzzleTuning/SSL_structures/TransMUNet_main/TransMUNet.py b/PuzzleTuning/SSL_structures/TransMUNet_main/TransMUNet.py new file mode 100644 index 0000000000000000000000000000000000000000..fe56bce811ee0433a92602ad882358cf3234b33b --- /dev/null +++ b/PuzzleTuning/SSL_structures/TransMUNet_main/TransMUNet.py @@ -0,0 +1,189 @@ +import torch +import torch.nn as nn +import torchvision + +resnet = torchvision.models.resnet.resnet50(pretrained=True) +from .munet_transformer import transmunet +import cv2 +import numpy as np + + +class ConvBlock(nn.Module): + """ + Helper module that consists of a Conv -> BN -> ReLU + """ + + def __init__(self, in_channels, out_channels, padding=1, kernel_size=3, stride=1, with_nonlinearity=True): + super().__init__() + self.conv = nn.Conv2d(in_channels, out_channels, padding=padding, kernel_size=kernel_size, stride=stride) + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU() + self.with_nonlinearity = with_nonlinearity + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.with_nonlinearity: + x = self.relu(x) + return x + + +class Bridge(nn.Module): + """ + This is the middle layer of the UNet which just consists of some + """ + + def __init__(self, in_channels, out_channels): + super().__init__() + self.bridge = nn.Sequential( + 
ConvBlock(in_channels, out_channels), + ConvBlock(out_channels, out_channels) + ) + + def forward(self, x): + return self.bridge(x) + + +class UpBlockForUNetWithResNet50(nn.Module): + """ + Up block that encapsulates one up-sampling step which consists of Upsample -> ConvBlock -> ConvBlock + """ + + def __init__(self, in_channels, out_channels, up_conv_in_channels=None, up_conv_out_channels=None, + upsampling_method="conv_transpose"): + super().__init__() + + if up_conv_in_channels == None: + up_conv_in_channels = in_channels + if up_conv_out_channels == None: + up_conv_out_channels = out_channels + + if upsampling_method == "conv_transpose": + self.upsample = nn.ConvTranspose2d(up_conv_in_channels, up_conv_out_channels, kernel_size=2, stride=2) + elif upsampling_method == "bilinear": + self.upsample = nn.Sequential( + nn.Upsample(mode='bilinear', scale_factor=2), + nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1) + ) + self.conv_block_1 = ConvBlock(in_channels, out_channels) + self.conv_block_2 = ConvBlock(out_channels, out_channels) + + def forward(self, up_x, down_x): + """ + + :param up_x: this is the output from the previous up block + :param down_x: this is the output from the down block + :return: upsampled feature map + """ + x = self.upsample(up_x) + x = torch.cat([x, down_x], 1) + x = self.conv_block_1(x) + x = self.conv_block_2(x) + return x + + +class SE_Block(nn.Module): + def __init__(self, c, r=16): + super().__init__() + self.squeeze = nn.AdaptiveAvgPool2d(1) + self.excitation = nn.Sequential( + nn.Linear(c, c // r, bias=False), + nn.ReLU(inplace=True), + nn.Linear(c // r, c, bias=False), + nn.Sigmoid() + ) + + def forward(self, x): + bs, c, _, _ = x.shape + y = self.squeeze(x).view(bs, c) + y = self.excitation(y).view(bs, c, 1, 1) + x = x * y.expand_as(x) + return y + + +class TransMUNet(nn.Module): + DEPTH = 6 + + def __init__(self, n_classes=2, + patch_size: int = 16, + emb_size: int = 512, + img_size: int = 256, + n_channels=3, + 
depth: int = 4, + n_regions: int = (256 // 16) ** 2, + output_ch: int = 1, + bilinear=True): + super().__init__() + self.n_classes = n_classes + self.transformer = transmunet(in_channels=n_channels, + patch_size=patch_size, + emb_size=emb_size, + img_size=img_size, + depth=depth, + n_regions=n_regions) + resnet = torchvision.models.resnet.resnet50(pretrained=True) + down_blocks = [] + up_blocks = [] + self.input_block = nn.Sequential(*list(resnet.children()))[:3] + self.input_pool = list(resnet.children())[3] + for bottleneck in list(resnet.children()): + if isinstance(bottleneck, nn.Sequential): + down_blocks.append(bottleneck) + self.down_blocks = nn.ModuleList(down_blocks) + self.bridge = Bridge(2048, 2048) + up_blocks.append(UpBlockForUNetWithResNet50(2048, 1024)) + up_blocks.append(UpBlockForUNetWithResNet50(1024, 512)) + up_blocks.append(UpBlockForUNetWithResNet50(512, 256)) + up_blocks.append(UpBlockForUNetWithResNet50(in_channels=128 + 64, out_channels=128, + up_conv_in_channels=256, up_conv_out_channels=128)) + up_blocks.append(UpBlockForUNetWithResNet50(in_channels=64 + 3, out_channels=64, + up_conv_in_channels=128, up_conv_out_channels=64)) + + self.up_blocks = nn.ModuleList(up_blocks) + + self.out = nn.Conv2d(128, n_classes, kernel_size=1, stride=1) + + self.boundary = nn.Sequential(nn.Conv2d(64, 32, kernel_size=1, stride=1), + nn.BatchNorm2d(32), nn.ReLU(inplace=True), + nn.Conv2d(32, 1, kernel_size=1, stride=1, bias=False), + nn.Sigmoid()) + + self.se = SE_Block(c=64) + + def forward(self, x, with_additional=False): + [global_contexual, regional_distribution, region_coeff] = self.transformer(x) + + pre_pools = dict() + pre_pools[f"layer_0"] = x + x = self.input_block(x) + pre_pools[f"layer_1"] = x + x = self.input_pool(x) + + for i, block in enumerate(self.down_blocks, 2): + x = block(x) + if i == (TransMUNet.DEPTH - 1): + continue + pre_pools[f"layer_{i}"] = x + + x = self.bridge(x) + + for i, block in enumerate(self.up_blocks, 1): + key = 
f"layer_{TransMUNet.DEPTH - 1 - i}" + x = block(x, pre_pools[key]) + + B_out = self.boundary(x) + B = B_out.repeat_interleave(int(x.shape[1]), dim=1) + x = self.se(x) + x = x + B + att = regional_distribution.repeat_interleave(int(x.shape[1]), dim=1) + x = x * att + x = torch.cat((x, global_contexual), dim=1) + x = self.out(x) + # print(x.shape) + del pre_pools + x = torch.sigmoid(x) + # print('x shape: ', x.shape) + if with_additional: + return x, B_out, region_coeff + else: + return x diff --git a/PuzzleTuning/SSL_structures/TransMUNet_main/munet_transformer.py b/PuzzleTuning/SSL_structures/TransMUNet_main/munet_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..49345b3ed27e52f0999519beb44a8e0189b67d65 --- /dev/null +++ b/PuzzleTuning/SSL_structures/TransMUNet_main/munet_transformer.py @@ -0,0 +1,219 @@ +import torch +import torch.nn.functional as F +from torch import nn +from torch import Tensor +from einops import rearrange, reduce, repeat +from einops.layers.torch import Rearrange, Reduce + + +class DoubleConv(nn.Module): + def __init__(self, in_channels, out_channels, mid_channels=None): + super().__init__() + if not mid_channels: + mid_channels = out_channels + self.double_conv = nn.Sequential( + nn.Conv2d(in_channels, mid_channels, kernel_size=3, padding=1), + nn.BatchNorm2d(mid_channels), + nn.ReLU(inplace=True), + nn.Conv2d(mid_channels, out_channels, kernel_size=3, padding=1), + nn.BatchNorm2d(out_channels), + nn.ReLU(inplace=True), + nn.MaxPool2d(2) + ) + + def forward(self, x): + return self.double_conv(x) + + +class Encoder_patch(nn.Module): + def __init__(self, n_channels, emb_size=512, bilinear=True): + super(Encoder_patch, self).__init__() + self.n_channels = n_channels + self.emb_size = emb_size + self.bilinear = bilinear + + self.conv1 = DoubleConv(n_channels, 128) + self.conv2 = DoubleConv(128, 256) + self.conv3 = DoubleConv(256, emb_size) + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + x = 
self.conv3(x) + x = torch.flatten(torch.nn.functional.adaptive_avg_pool2d(x, 1), start_dim=1) + return x + + +class PatchEmbedding(nn.Module): + def __init__(self, in_channels: int = 3, patch_size: int = 16, emb_size: int = 768, img_size: int = 224): + self.patch_size = patch_size + super().__init__() + # self.encoder = Encoder_patch(n_channels = in_channels, emb_size= emb_size) + self.projection = nn.Sequential( + Rearrange('b c (ph h) (pw w) -> b c (ph pw) h w', c=in_channels, h=patch_size, ph=img_size // patch_size, + w=patch_size, pw=img_size // patch_size), + Rearrange('b c p h w -> (b p) c h w'), + Encoder_patch(n_channels=in_channels, emb_size=emb_size), + Rearrange('(b p) d-> b p d', p=(img_size // patch_size) ** 2), + ) + self.cls_token = nn.Parameter(torch.randn(1, 1, emb_size)) + self.positions = nn.Parameter(torch.randn((img_size // patch_size) ** 2 + 1, emb_size)) + + def forward(self, x: Tensor) -> Tensor: + b, _, _, _ = x.shape + x = self.projection(x) + cls_tokens = repeat(self.cls_token, '() n e -> b n e', b=b) + # prepend the cls token to the input + x = torch.cat([cls_tokens, x], dim=1) + # add position embedding + x += self.positions + return x + + +class MultiHeadAttention(nn.Module): + def __init__(self, emb_size: int = 768, num_heads: int = 8, dropout: float = 0): + super().__init__() + self.emb_size = emb_size + self.num_heads = num_heads + # fuse the queries, keys and values in one matrix + self.qkv = nn.Linear(emb_size, emb_size * 3) + self.att_drop = nn.Dropout(dropout) + self.projection = nn.Linear(emb_size, emb_size) + + def forward(self, x: Tensor, mask: Tensor = None) -> Tensor: + # split keys, queries and values in num_heads + qkv = rearrange(self.qkv(x), "b n (h d qkv) -> (qkv) b h n d", h=self.num_heads, qkv=3) + queries, keys, values = qkv[0], qkv[1], qkv[2] + # sum up over the last axis + energy = torch.einsum('bhqd, bhkd -> bhqk', queries, keys) # batch, num_heads, query_len, key_len + if mask is not None: + fill_value = 
torch.finfo(torch.float32).min + energy = energy.masked_fill(~mask, fill_value) # masked_fill is out-of-place, so the result must be kept; Tensor has no mask_fill method + + scaling = self.emb_size ** (1 / 2) + att = F.softmax(energy, dim=-1) / scaling # NOTE(review): scaling is applied after the softmax and uses emb_size, not the per-head dim; left unchanged so existing checkpoints keep their numerics + att = self.att_drop(att) + # sum up over the third axis + out = torch.einsum('bhal, bhlv -> bhav ', att, values) + out = rearrange(out, "b h n d -> b n (h d)") + out = self.projection(out) + return out + + +class ResidualAdd(nn.Module): + def __init__(self, fn): + super().__init__() + self.fn = fn + + def forward(self, x, **kwargs): + res = x + x = self.fn(x, **kwargs) + x += res + return x + + +class FeedForwardBlock(nn.Sequential): + def __init__(self, emb_size: int, expansion: int = 4, drop_p: float = 0.): + super().__init__( + nn.Linear(emb_size, expansion * emb_size), + nn.GELU(), + nn.Dropout(drop_p), + nn.Linear(expansion * emb_size, emb_size), + ) + + +class TransformerEncoderBlock(nn.Sequential): + def __init__(self, + emb_size: int = 768, + drop_p: float = 0., + forward_expansion: int = 4, + forward_drop_p: float = 0., + **kwargs): + super().__init__( + ResidualAdd(nn.Sequential( + nn.LayerNorm(emb_size), + MultiHeadAttention(emb_size, **kwargs), + nn.Dropout(drop_p) + )), + ResidualAdd(nn.Sequential( + nn.LayerNorm(emb_size), + FeedForwardBlock( + emb_size, expansion=forward_expansion, drop_p=forward_drop_p), + nn.Dropout(drop_p) + ) + )) + + +class TransformerEncoder(nn.Sequential): + def __init__(self, depth: int = 12, **kwargs): + super().__init__(*[TransformerEncoderBlock(**kwargs) for _ in range(depth)]) + + +class dependencymap(nn.Sequential): + def __init__(self, emb_size: int = 768, n_regions: int = 256, patch_size: int = 16, img_size: int = 256, + output_ch: int = 64, cuda=True): + super().__init__() + self.patch_size = patch_size + self.img_size = img_size + self.emb_size = emb_size + self.output_ch = output_ch + self.cuda = cuda # NOTE(review): this instance attribute shadows nn.Module.cuda() on this module + self.outconv = nn.Sequential( + nn.Conv2d(emb_size, output_ch, kernel_size=1, padding=0), + nn.BatchNorm2d(output_ch), + nn.Sigmoid() + ) + self.out2 = nn.Sigmoid() 
+ + self.gpool = nn.AdaptiveAvgPool1d(1) + + def forward(self, x): + x_gpool = self.gpool(x) + coeff = torch.zeros((x.size()[0], self.emb_size, self.img_size, self.img_size)) + coeff2 = torch.zeros((x.size()[0], 1, self.img_size, self.img_size)) + if self.cuda: + coeff = coeff.cuda() + coeff2 = coeff2.cuda() + for i in range(0, self.img_size // self.patch_size): + for j in range(0, self.img_size // self.patch_size): + value = x[:, (i * (self.img_size // self.patch_size)) + j] # row stride is the number of patches per side, not patch_size (equal only for 256/16) + value = value.view(value.size()[0], value.size()[1], 1, 1) + coeff[:, :, self.patch_size * i:self.patch_size * (i + 1), + self.patch_size * j:self.patch_size * (j + 1)] = value.repeat(1, 1, self.patch_size, self.patch_size) + + value = x_gpool[:, (i * (self.img_size // self.patch_size)) + j] # NOTE(review): if x still carries a leading cls token, index 0 is that token and the last patch is never read; confirm + value = value.view(value.size()[0], value.size()[1], 1, 1) + coeff2[:, :, self.patch_size * i:self.patch_size * (i + 1), + self.patch_size * j:self.patch_size * (j + 1)] = value.repeat(1, 1, self.patch_size, self.patch_size) + + global_contexual = self.outconv(coeff) + regional_distribution = self.out2(coeff2) + return [global_contexual, regional_distribution, self.out2(x_gpool)] + + +class transmunet(nn.Sequential): + def __init__(self, + in_channels: int = 3, + patch_size: int = 16, + emb_size: int = 1024, + img_size: int = 256, + depth: int = 2, + n_regions: int = (256 // 16) ** 2, + output_ch: int = 64, + cuda=True, + **kwargs): + super().__init__( + PatchEmbedding(in_channels, patch_size, emb_size, img_size), + TransformerEncoder(depth, emb_size=emb_size, **kwargs), + dependencymap(emb_size, n_regions, patch_size, img_size, output_ch, cuda) + ) + + + + + + + + + + diff --git a/PuzzleTuning/SSL_structures/TransUNet_main/LICENSE b/PuzzleTuning/SSL_structures/TransUNet_main/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64 --- /dev/null +++ b/PuzzleTuning/SSL_structures/TransUNet_main/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + 
http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. 
For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/PuzzleTuning/SSL_structures/TransUNet_main/README.md b/PuzzleTuning/SSL_structures/TransUNet_main/README.md new file mode 100644 index 0000000000000000000000000000000000000000..275d10ba506a159b8142da71265426aa81f532b2 --- /dev/null +++ b/PuzzleTuning/SSL_structures/TransUNet_main/README.md @@ -0,0 +1,50 @@ +# TransUNet +This repo holds code for [TransUNet: Transformers Make Strong Encoders for Medical Image Segmentation](https://arxiv.org/pdf/2102.04306.pdf) + +## Usage + +### 1. Download Google pre-trained ViT models +* [Get models in this link](https://console.cloud.google.com/storage/vit_models/): R50-ViT-B_16, ViT-B_16, ViT-L_16... 
+```bash +wget https://storage.googleapis.com/vit_models/imagenet21k/{MODEL_NAME}.npz && +mkdir ../model/vit_checkpoint/imagenet21k && +mv {MODEL_NAME}.npz ../model/vit_checkpoint/imagenet21k/{MODEL_NAME}.npz +``` + +### 2. Prepare data + +Please go to ["./datasets/README.md"](datasets/README.md) for details, or please send an Email to jienengchen01 AT gmail.com to request the preprocessed data. If you would like to use the preprocessed data, please use it for research purposes and do not redistribute it. + +### 3. Environment + +Please prepare an environment with python=3.7, and then use the command "pip install -r requirements.txt" for the dependencies. + +### 4. Train/Test + +- Run the train script on synapse dataset. The batch size can be reduced to 12 or 6 to save memory (please also decrease the base_lr linearly), and both can reach similar performance. + +```bash +CUDA_VISIBLE_DEVICES=0 python train.py --dataset Synapse --vit_name R50-ViT-B_16 +``` + +- Run the test script on synapse dataset. It supports testing for both 2D images and 3D volumes. 
+ +```bash +python test.py --dataset Synapse --vit_name R50-ViT-B_16 +``` + +## Reference +* [Google ViT](https://github.com/google-research/vision_transformer) +* [ViT-pytorch](https://github.com/jeonsworld/ViT-pytorch) +* [segmentation_models.pytorch](https://github.com/qubvel/segmentation_models.pytorch) + +## Citations + +```bibtex +@article{chen2021transunet, + title={TransUNet: Transformers Make Strong Encoders for Medical Image Segmentation}, + author={Chen, Jieneng and Lu, Yongyi and Yu, Qihang and Luo, Xiangde and Adeli, Ehsan and Wang, Yan and Lu, Le and Yuille, Alan L., and Zhou, Yuyin}, + journal={arXiv preprint arXiv:2102.04306}, + year={2021} +} +``` diff --git a/PuzzleTuning/SSL_structures/TransUNet_main/datasets/README.md b/PuzzleTuning/SSL_structures/TransUNet_main/datasets/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c662f8e2c24f9b5a899338cfe5fcd67f43d1bba5 --- /dev/null +++ b/PuzzleTuning/SSL_structures/TransUNet_main/datasets/README.md @@ -0,0 +1,29 @@ +# Data Preparing + +1. Access to the synapse multi-organ dataset: + 1. Sign up in the [official Synapse website](https://www.synapse.org/#!Synapse:syn3193805/wiki/) and download the dataset. Convert them to numpy format, clip the images within [-125, 275], normalize each 3D image to [0, 1], and extract 2D slices from 3D volume for training cases while keeping the 3D volume in h5 format for testing cases. + 2. You can also send an Email directly to jienengchen01 AT gmail.com to request the preprocessed data for reproduction. +2. The directory structure of the whole project is as follows: + +```bash +. +├── TransUNet +│   ├──datasets +│   │    └── dataset_*.py +│   ├──train.py +│   ├──test.py +│   └──... 
+├── model +│   └── vit_checkpoint +│   └── imagenet21k +│      ├── R50+ViT-B_16.npz +│      └── *.npz +└── data + └──Synapse + ├── test_vol_h5 + │   ├── case0001.npy.h5 + │   └── *.npy.h5 + └── train_npz + ├── case0005_slice000.npz + └── *.npz +``` diff --git a/PuzzleTuning/SSL_structures/TransUNet_main/datasets/dataset_synapse.py b/PuzzleTuning/SSL_structures/TransUNet_main/datasets/dataset_synapse.py new file mode 100644 index 0000000000000000000000000000000000000000..c5d0de1a99f8ca46851f51e45570d4ddc8fbff09 --- /dev/null +++ b/PuzzleTuning/SSL_structures/TransUNet_main/datasets/dataset_synapse.py @@ -0,0 +1,75 @@ +import os +import random +import h5py +import numpy as np +import torch +from scipy import ndimage +from scipy.ndimage.interpolation import zoom +from torch.utils.data import Dataset + + +def random_rot_flip(image, label): + k = np.random.randint(0, 4) + image = np.rot90(image, k) + label = np.rot90(label, k) + axis = np.random.randint(0, 2) + image = np.flip(image, axis=axis).copy() + label = np.flip(label, axis=axis).copy() + return image, label + + +def random_rotate(image, label): + angle = np.random.randint(-20, 20) + image = ndimage.rotate(image, angle, order=0, reshape=False) + label = ndimage.rotate(label, angle, order=0, reshape=False) + return image, label + + +class RandomGenerator(object): + def __init__(self, output_size): + self.output_size = output_size + + def __call__(self, sample): + image, label = sample['image'], sample['label'] + + if random.random() > 0.5: + image, label = random_rot_flip(image, label) + elif random.random() > 0.5: + image, label = random_rotate(image, label) + x, y = image.shape + if x != self.output_size[0] or y != self.output_size[1]: + image = zoom(image, (self.output_size[0] / x, self.output_size[1] / y), order=3) # why not 3? 
+ label = zoom(label, (self.output_size[0] / x, self.output_size[1] / y), order=0) + image = torch.from_numpy(image.astype(np.float32)).unsqueeze(0) + label = torch.from_numpy(label.astype(np.float32)) + sample = {'image': image, 'label': label.long()} + return sample + + +class Synapse_dataset(Dataset): + def __init__(self, base_dir, list_dir, split, transform=None): + self.transform = transform # using transform in torch! + self.split = split + with open(os.path.join(list_dir, self.split+'.txt')) as list_file: self.sample_list = list_file.readlines() # close the list file instead of leaking the handle + self.data_dir = base_dir + + def __len__(self): + return len(self.sample_list) + + def __getitem__(self, idx): + if self.split == "train": + slice_name = self.sample_list[idx].strip('\n') + data_path = os.path.join(self.data_dir, slice_name+'.npz') + with np.load(data_path) as data: # the NpzFile keeps the archive open; close it once both arrays are read + image, label = data['image'], data['label'] + else: + vol_name = self.sample_list[idx].strip('\n') + filepath = self.data_dir + "/{}.npy.h5".format(vol_name) + with h5py.File(filepath, 'r') as data: # read-only; closed after the [:] copies are taken + image, label = data['image'][:], data['label'][:] + + sample = {'image': image, 'label': label} + if self.transform: + sample = self.transform(sample) + sample['case_name'] = self.sample_list[idx].strip('\n') + return sample diff --git a/PuzzleTuning/SSL_structures/TransUNet_main/lists/lists_Synapse/all.lst b/PuzzleTuning/SSL_structures/TransUNet_main/lists/lists_Synapse/all.lst new file mode 100644 index 0000000000000000000000000000000000000000..6ef047d4b8be2ea61d1621620e420a6f3c974ec2 --- /dev/null +++ b/PuzzleTuning/SSL_structures/TransUNet_main/lists/lists_Synapse/all.lst @@ -0,0 +1,30 @@ +case0031.npy.h5 +case0007.npy.h5 +case0009.npy.h5 +case0005.npy.h5 +case0026.npy.h5 +case0039.npy.h5 +case0024.npy.h5 +case0034.npy.h5 +case0033.npy.h5 +case0030.npy.h5 +case0023.npy.h5 +case0040.npy.h5 +case0010.npy.h5 +case0021.npy.h5 +case0006.npy.h5 +case0027.npy.h5 +case0028.npy.h5 +case0037.npy.h5 +case0008.npy.h5 +case0022.npy.h5 +case0038.npy.h5 +case0036.npy.h5 +case0032.npy.h5 
+case0002.npy.h5 +case0029.npy.h5 +case0003.npy.h5 +case0001.npy.h5 +case0004.npy.h5 +case0025.npy.h5 +case0035.npy.h5 diff --git a/PuzzleTuning/SSL_structures/TransUNet_main/lists/lists_Synapse/test_vol.txt b/PuzzleTuning/SSL_structures/TransUNet_main/lists/lists_Synapse/test_vol.txt new file mode 100644 index 0000000000000000000000000000000000000000..1c4abd53044eed5457fd1f7e0cca1c99e7222593 --- /dev/null +++ b/PuzzleTuning/SSL_structures/TransUNet_main/lists/lists_Synapse/test_vol.txt @@ -0,0 +1,12 @@ +case0008 +case0022 +case0038 +case0036 +case0032 +case0002 +case0029 +case0003 +case0001 +case0004 +case0025 +case0035 diff --git a/PuzzleTuning/SSL_structures/TransUNet_main/lists/lists_Synapse/train.txt b/PuzzleTuning/SSL_structures/TransUNet_main/lists/lists_Synapse/train.txt new file mode 100644 index 0000000000000000000000000000000000000000..e58616844994a95407d1f664b79cd4e4533d41b8 --- /dev/null +++ b/PuzzleTuning/SSL_structures/TransUNet_main/lists/lists_Synapse/train.txt @@ -0,0 +1,2211 @@ +case0031_slice000 +case0031_slice001 +case0031_slice002 +case0031_slice003 +case0031_slice004 +case0031_slice005 +case0031_slice006 +case0031_slice007 +case0031_slice008 +case0031_slice009 +case0031_slice010 +case0031_slice011 +case0031_slice012 +case0031_slice013 +case0031_slice014 +case0031_slice015 +case0031_slice016 +case0031_slice017 +case0031_slice018 +case0031_slice019 +case0031_slice020 +case0031_slice021 +case0031_slice022 +case0031_slice023 +case0031_slice024 +case0031_slice025 +case0031_slice026 +case0031_slice027 +case0031_slice028 +case0031_slice029 +case0031_slice030 +case0031_slice031 +case0031_slice032 +case0031_slice033 +case0031_slice034 +case0031_slice035 +case0031_slice036 +case0031_slice037 +case0031_slice038 +case0031_slice039 +case0031_slice040 +case0031_slice041 +case0031_slice042 +case0031_slice043 +case0031_slice044 +case0031_slice045 +case0031_slice046 +case0031_slice047 +case0031_slice048 +case0031_slice049 +case0031_slice050 +case0031_slice051 
+case0031_slice052 +case0031_slice053 +case0031_slice054 +case0031_slice055 +case0031_slice056 +case0031_slice057 +case0031_slice058 +case0031_slice059 +case0031_slice060 +case0031_slice061 +case0031_slice062 +case0031_slice063 +case0031_slice064 +case0031_slice065 +case0031_slice066 +case0031_slice067 +case0031_slice068 +case0031_slice069 +case0031_slice070 +case0031_slice071 +case0031_slice072 +case0031_slice073 +case0031_slice074 +case0031_slice075 +case0031_slice076 +case0031_slice077 +case0031_slice078 +case0031_slice079 +case0031_slice080 +case0031_slice081 +case0031_slice082 +case0031_slice083 +case0031_slice084 +case0031_slice085 +case0031_slice086 +case0031_slice087 +case0031_slice088 +case0031_slice089 +case0031_slice090 +case0031_slice091 +case0031_slice092 +case0007_slice000 +case0007_slice001 +case0007_slice002 +case0007_slice003 +case0007_slice004 +case0007_slice005 +case0007_slice006 +case0007_slice007 +case0007_slice008 +case0007_slice009 +case0007_slice010 +case0007_slice011 +case0007_slice012 +case0007_slice013 +case0007_slice014 +case0007_slice015 +case0007_slice016 +case0007_slice017 +case0007_slice018 +case0007_slice019 +case0007_slice020 +case0007_slice021 +case0007_slice022 +case0007_slice023 +case0007_slice024 +case0007_slice025 +case0007_slice026 +case0007_slice027 +case0007_slice028 +case0007_slice029 +case0007_slice030 +case0007_slice031 +case0007_slice032 +case0007_slice033 +case0007_slice034 +case0007_slice035 +case0007_slice036 +case0007_slice037 +case0007_slice038 +case0007_slice039 +case0007_slice040 +case0007_slice041 +case0007_slice042 +case0007_slice043 +case0007_slice044 +case0007_slice045 +case0007_slice046 +case0007_slice047 +case0007_slice048 +case0007_slice049 +case0007_slice050 +case0007_slice051 +case0007_slice052 +case0007_slice053 +case0007_slice054 +case0007_slice055 +case0007_slice056 +case0007_slice057 +case0007_slice058 +case0007_slice059 +case0007_slice060 +case0007_slice061 +case0007_slice062 +case0007_slice063 
+case0007_slice064 +case0007_slice065 +case0007_slice066 +case0007_slice067 +case0007_slice068 +case0007_slice069 +case0007_slice070 +case0007_slice071 +case0007_slice072 +case0007_slice073 +case0007_slice074 +case0007_slice075 +case0007_slice076 +case0007_slice077 +case0007_slice078 +case0007_slice079 +case0007_slice080 +case0007_slice081 +case0007_slice082 +case0007_slice083 +case0007_slice084 +case0007_slice085 +case0007_slice086 +case0007_slice087 +case0007_slice088 +case0007_slice089 +case0007_slice090 +case0007_slice091 +case0007_slice092 +case0007_slice093 +case0007_slice094 +case0007_slice095 +case0007_slice096 +case0007_slice097 +case0007_slice098 +case0007_slice099 +case0007_slice100 +case0007_slice101 +case0007_slice102 +case0007_slice103 +case0007_slice104 +case0007_slice105 +case0007_slice106 +case0007_slice107 +case0007_slice108 +case0007_slice109 +case0007_slice110 +case0007_slice111 +case0007_slice112 +case0007_slice113 +case0007_slice114 +case0007_slice115 +case0007_slice116 +case0007_slice117 +case0007_slice118 +case0007_slice119 +case0007_slice120 +case0007_slice121 +case0007_slice122 +case0007_slice123 +case0007_slice124 +case0007_slice125 +case0007_slice126 +case0007_slice127 +case0007_slice128 +case0007_slice129 +case0007_slice130 +case0007_slice131 +case0007_slice132 +case0007_slice133 +case0007_slice134 +case0007_slice135 +case0007_slice136 +case0007_slice137 +case0007_slice138 +case0007_slice139 +case0007_slice140 +case0007_slice141 +case0007_slice142 +case0007_slice143 +case0007_slice144 +case0007_slice145 +case0007_slice146 +case0007_slice147 +case0007_slice148 +case0007_slice149 +case0007_slice150 +case0007_slice151 +case0007_slice152 +case0007_slice153 +case0007_slice154 +case0007_slice155 +case0007_slice156 +case0007_slice157 +case0007_slice158 +case0007_slice159 +case0007_slice160 +case0007_slice161 +case0007_slice162 +case0009_slice000 +case0009_slice001 +case0009_slice002 +case0009_slice003 +case0009_slice004 +case0009_slice005 
+case0009_slice006 +case0009_slice007 +case0009_slice008 +case0009_slice009 +case0009_slice010 +case0009_slice011 +case0009_slice012 +case0009_slice013 +case0009_slice014 +case0009_slice015 +case0009_slice016 +case0009_slice017 +case0009_slice018 +case0009_slice019 +case0009_slice020 +case0009_slice021 +case0009_slice022 +case0009_slice023 +case0009_slice024 +case0009_slice025 +case0009_slice026 +case0009_slice027 +case0009_slice028 +case0009_slice029 +case0009_slice030 +case0009_slice031 +case0009_slice032 +case0009_slice033 +case0009_slice034 +case0009_slice035 +case0009_slice036 +case0009_slice037 +case0009_slice038 +case0009_slice039 +case0009_slice040 +case0009_slice041 +case0009_slice042 +case0009_slice043 +case0009_slice044 +case0009_slice045 +case0009_slice046 +case0009_slice047 +case0009_slice048 +case0009_slice049 +case0009_slice050 +case0009_slice051 +case0009_slice052 +case0009_slice053 +case0009_slice054 +case0009_slice055 +case0009_slice056 +case0009_slice057 +case0009_slice058 +case0009_slice059 +case0009_slice060 +case0009_slice061 +case0009_slice062 +case0009_slice063 +case0009_slice064 +case0009_slice065 +case0009_slice066 +case0009_slice067 +case0009_slice068 +case0009_slice069 +case0009_slice070 +case0009_slice071 +case0009_slice072 +case0009_slice073 +case0009_slice074 +case0009_slice075 +case0009_slice076 +case0009_slice077 +case0009_slice078 +case0009_slice079 +case0009_slice080 +case0009_slice081 +case0009_slice082 +case0009_slice083 +case0009_slice084 +case0009_slice085 +case0009_slice086 +case0009_slice087 +case0009_slice088 +case0009_slice089 +case0009_slice090 +case0009_slice091 +case0009_slice092 +case0009_slice093 +case0009_slice094 +case0009_slice095 +case0009_slice096 +case0009_slice097 +case0009_slice098 +case0009_slice099 +case0009_slice100 +case0009_slice101 +case0009_slice102 +case0009_slice103 +case0009_slice104 +case0009_slice105 +case0009_slice106 +case0009_slice107 +case0009_slice108 +case0009_slice109 +case0009_slice110 
+case0009_slice111 +case0009_slice112 +case0009_slice113 +case0009_slice114 +case0009_slice115 +case0009_slice116 +case0009_slice117 +case0009_slice118 +case0009_slice119 +case0009_slice120 +case0009_slice121 +case0009_slice122 +case0009_slice123 +case0009_slice124 +case0009_slice125 +case0009_slice126 +case0009_slice127 +case0009_slice128 +case0009_slice129 +case0009_slice130 +case0009_slice131 +case0009_slice132 +case0009_slice133 +case0009_slice134 +case0009_slice135 +case0009_slice136 +case0009_slice137 +case0009_slice138 +case0009_slice139 +case0009_slice140 +case0009_slice141 +case0009_slice142 +case0009_slice143 +case0009_slice144 +case0009_slice145 +case0009_slice146 +case0009_slice147 +case0009_slice148 +case0005_slice000 +case0005_slice001 +case0005_slice002 +case0005_slice003 +case0005_slice004 +case0005_slice005 +case0005_slice006 +case0005_slice007 +case0005_slice008 +case0005_slice009 +case0005_slice010 +case0005_slice011 +case0005_slice012 +case0005_slice013 +case0005_slice014 +case0005_slice015 +case0005_slice016 +case0005_slice017 +case0005_slice018 +case0005_slice019 +case0005_slice020 +case0005_slice021 +case0005_slice022 +case0005_slice023 +case0005_slice024 +case0005_slice025 +case0005_slice026 +case0005_slice027 +case0005_slice028 +case0005_slice029 +case0005_slice030 +case0005_slice031 +case0005_slice032 +case0005_slice033 +case0005_slice034 +case0005_slice035 +case0005_slice036 +case0005_slice037 +case0005_slice038 +case0005_slice039 +case0005_slice040 +case0005_slice041 +case0005_slice042 +case0005_slice043 +case0005_slice044 +case0005_slice045 +case0005_slice046 +case0005_slice047 +case0005_slice048 +case0005_slice049 +case0005_slice050 +case0005_slice051 +case0005_slice052 +case0005_slice053 +case0005_slice054 +case0005_slice055 +case0005_slice056 +case0005_slice057 +case0005_slice058 +case0005_slice059 +case0005_slice060 +case0005_slice061 +case0005_slice062 +case0005_slice063 +case0005_slice064 +case0005_slice065 +case0005_slice066 
+case0005_slice067 +case0005_slice068 +case0005_slice069 +case0005_slice070 +case0005_slice071 +case0005_slice072 +case0005_slice073 +case0005_slice074 +case0005_slice075 +case0005_slice076 +case0005_slice077 +case0005_slice078 +case0005_slice079 +case0005_slice080 +case0005_slice081 +case0005_slice082 +case0005_slice083 +case0005_slice084 +case0005_slice085 +case0005_slice086 +case0005_slice087 +case0005_slice088 +case0005_slice089 +case0005_slice090 +case0005_slice091 +case0005_slice092 +case0005_slice093 +case0005_slice094 +case0005_slice095 +case0005_slice096 +case0005_slice097 +case0005_slice098 +case0005_slice099 +case0005_slice100 +case0005_slice101 +case0005_slice102 +case0005_slice103 +case0005_slice104 +case0005_slice105 +case0005_slice106 +case0005_slice107 +case0005_slice108 +case0005_slice109 +case0005_slice110 +case0005_slice111 +case0005_slice112 +case0005_slice113 +case0005_slice114 +case0005_slice115 +case0005_slice116 +case0026_slice000 +case0026_slice001 +case0026_slice002 +case0026_slice003 +case0026_slice004 +case0026_slice005 +case0026_slice006 +case0026_slice007 +case0026_slice008 +case0026_slice009 +case0026_slice010 +case0026_slice011 +case0026_slice012 +case0026_slice013 +case0026_slice014 +case0026_slice015 +case0026_slice016 +case0026_slice017 +case0026_slice018 +case0026_slice019 +case0026_slice020 +case0026_slice021 +case0026_slice022 +case0026_slice023 +case0026_slice024 +case0026_slice025 +case0026_slice026 +case0026_slice027 +case0026_slice028 +case0026_slice029 +case0026_slice030 +case0026_slice031 +case0026_slice032 +case0026_slice033 +case0026_slice034 +case0026_slice035 +case0026_slice036 +case0026_slice037 +case0026_slice038 +case0026_slice039 +case0026_slice040 +case0026_slice041 +case0026_slice042 +case0026_slice043 +case0026_slice044 +case0026_slice045 +case0026_slice046 +case0026_slice047 +case0026_slice048 +case0026_slice049 +case0026_slice050 +case0026_slice051 +case0026_slice052 +case0026_slice053 +case0026_slice054 
+case0026_slice055 +case0026_slice056 +case0026_slice057 +case0026_slice058 +case0026_slice059 +case0026_slice060 +case0026_slice061 +case0026_slice062 +case0026_slice063 +case0026_slice064 +case0026_slice065 +case0026_slice066 +case0026_slice067 +case0026_slice068 +case0026_slice069 +case0026_slice070 +case0026_slice071 +case0026_slice072 +case0026_slice073 +case0026_slice074 +case0026_slice075 +case0026_slice076 +case0026_slice077 +case0026_slice078 +case0026_slice079 +case0026_slice080 +case0026_slice081 +case0026_slice082 +case0026_slice083 +case0026_slice084 +case0026_slice085 +case0026_slice086 +case0026_slice087 +case0026_slice088 +case0026_slice089 +case0026_slice090 +case0026_slice091 +case0026_slice092 +case0026_slice093 +case0026_slice094 +case0026_slice095 +case0026_slice096 +case0026_slice097 +case0026_slice098 +case0026_slice099 +case0026_slice100 +case0026_slice101 +case0026_slice102 +case0026_slice103 +case0026_slice104 +case0026_slice105 +case0026_slice106 +case0026_slice107 +case0026_slice108 +case0026_slice109 +case0026_slice110 +case0026_slice111 +case0026_slice112 +case0026_slice113 +case0026_slice114 +case0026_slice115 +case0026_slice116 +case0026_slice117 +case0026_slice118 +case0026_slice119 +case0026_slice120 +case0026_slice121 +case0026_slice122 +case0026_slice123 +case0026_slice124 +case0026_slice125 +case0026_slice126 +case0026_slice127 +case0026_slice128 +case0026_slice129 +case0026_slice130 +case0039_slice000 +case0039_slice001 +case0039_slice002 +case0039_slice003 +case0039_slice004 +case0039_slice005 +case0039_slice006 +case0039_slice007 +case0039_slice008 +case0039_slice009 +case0039_slice010 +case0039_slice011 +case0039_slice012 +case0039_slice013 +case0039_slice014 +case0039_slice015 +case0039_slice016 +case0039_slice017 +case0039_slice018 +case0039_slice019 +case0039_slice020 +case0039_slice021 +case0039_slice022 +case0039_slice023 +case0039_slice024 +case0039_slice025 +case0039_slice026 +case0039_slice027 +case0039_slice028 
+case0039_slice029 +case0039_slice030 +case0039_slice031 +case0039_slice032 +case0039_slice033 +case0039_slice034 +case0039_slice035 +case0039_slice036 +case0039_slice037 +case0039_slice038 +case0039_slice039 +case0039_slice040 +case0039_slice041 +case0039_slice042 +case0039_slice043 +case0039_slice044 +case0039_slice045 +case0039_slice046 +case0039_slice047 +case0039_slice048 +case0039_slice049 +case0039_slice050 +case0039_slice051 +case0039_slice052 +case0039_slice053 +case0039_slice054 +case0039_slice055 +case0039_slice056 +case0039_slice057 +case0039_slice058 +case0039_slice059 +case0039_slice060 +case0039_slice061 +case0039_slice062 +case0039_slice063 +case0039_slice064 +case0039_slice065 +case0039_slice066 +case0039_slice067 +case0039_slice068 +case0039_slice069 +case0039_slice070 +case0039_slice071 +case0039_slice072 +case0039_slice073 +case0039_slice074 +case0039_slice075 +case0039_slice076 +case0039_slice077 +case0039_slice078 +case0039_slice079 +case0039_slice080 +case0039_slice081 +case0039_slice082 +case0039_slice083 +case0039_slice084 +case0039_slice085 +case0039_slice086 +case0039_slice087 +case0039_slice088 +case0039_slice089 +case0024_slice000 +case0024_slice001 +case0024_slice002 +case0024_slice003 +case0024_slice004 +case0024_slice005 +case0024_slice006 +case0024_slice007 +case0024_slice008 +case0024_slice009 +case0024_slice010 +case0024_slice011 +case0024_slice012 +case0024_slice013 +case0024_slice014 +case0024_slice015 +case0024_slice016 +case0024_slice017 +case0024_slice018 +case0024_slice019 +case0024_slice020 +case0024_slice021 +case0024_slice022 +case0024_slice023 +case0024_slice024 +case0024_slice025 +case0024_slice026 +case0024_slice027 +case0024_slice028 +case0024_slice029 +case0024_slice030 +case0024_slice031 +case0024_slice032 +case0024_slice033 +case0024_slice034 +case0024_slice035 +case0024_slice036 +case0024_slice037 +case0024_slice038 +case0024_slice039 +case0024_slice040 +case0024_slice041 +case0024_slice042 +case0024_slice043 
+case0024_slice044 +case0024_slice045 +case0024_slice046 +case0024_slice047 +case0024_slice048 +case0024_slice049 +case0024_slice050 +case0024_slice051 +case0024_slice052 +case0024_slice053 +case0024_slice054 +case0024_slice055 +case0024_slice056 +case0024_slice057 +case0024_slice058 +case0024_slice059 +case0024_slice060 +case0024_slice061 +case0024_slice062 +case0024_slice063 +case0024_slice064 +case0024_slice065 +case0024_slice066 +case0024_slice067 +case0024_slice068 +case0024_slice069 +case0024_slice070 +case0024_slice071 +case0024_slice072 +case0024_slice073 +case0024_slice074 +case0024_slice075 +case0024_slice076 +case0024_slice077 +case0024_slice078 +case0024_slice079 +case0024_slice080 +case0024_slice081 +case0024_slice082 +case0024_slice083 +case0024_slice084 +case0024_slice085 +case0024_slice086 +case0024_slice087 +case0024_slice088 +case0024_slice089 +case0024_slice090 +case0024_slice091 +case0024_slice092 +case0024_slice093 +case0024_slice094 +case0024_slice095 +case0024_slice096 +case0024_slice097 +case0024_slice098 +case0024_slice099 +case0024_slice100 +case0024_slice101 +case0024_slice102 +case0024_slice103 +case0024_slice104 +case0024_slice105 +case0024_slice106 +case0024_slice107 +case0024_slice108 +case0024_slice109 +case0024_slice110 +case0024_slice111 +case0024_slice112 +case0024_slice113 +case0024_slice114 +case0024_slice115 +case0024_slice116 +case0024_slice117 +case0024_slice118 +case0024_slice119 +case0024_slice120 +case0024_slice121 +case0024_slice122 +case0024_slice123 +case0034_slice000 +case0034_slice001 +case0034_slice002 +case0034_slice003 +case0034_slice004 +case0034_slice005 +case0034_slice006 +case0034_slice007 +case0034_slice008 +case0034_slice009 +case0034_slice010 +case0034_slice011 +case0034_slice012 +case0034_slice013 +case0034_slice014 +case0034_slice015 +case0034_slice016 +case0034_slice017 +case0034_slice018 +case0034_slice019 +case0034_slice020 +case0034_slice021 +case0034_slice022 +case0034_slice023 +case0034_slice024 
+case0034_slice025 +case0034_slice026 +case0034_slice027 +case0034_slice028 +case0034_slice029 +case0034_slice030 +case0034_slice031 +case0034_slice032 +case0034_slice033 +case0034_slice034 +case0034_slice035 +case0034_slice036 +case0034_slice037 +case0034_slice038 +case0034_slice039 +case0034_slice040 +case0034_slice041 +case0034_slice042 +case0034_slice043 +case0034_slice044 +case0034_slice045 +case0034_slice046 +case0034_slice047 +case0034_slice048 +case0034_slice049 +case0034_slice050 +case0034_slice051 +case0034_slice052 +case0034_slice053 +case0034_slice054 +case0034_slice055 +case0034_slice056 +case0034_slice057 +case0034_slice058 +case0034_slice059 +case0034_slice060 +case0034_slice061 +case0034_slice062 +case0034_slice063 +case0034_slice064 +case0034_slice065 +case0034_slice066 +case0034_slice067 +case0034_slice068 +case0034_slice069 +case0034_slice070 +case0034_slice071 +case0034_slice072 +case0034_slice073 +case0034_slice074 +case0034_slice075 +case0034_slice076 +case0034_slice077 +case0034_slice078 +case0034_slice079 +case0034_slice080 +case0034_slice081 +case0034_slice082 +case0034_slice083 +case0034_slice084 +case0034_slice085 +case0034_slice086 +case0034_slice087 +case0034_slice088 +case0034_slice089 +case0034_slice090 +case0034_slice091 +case0034_slice092 +case0034_slice093 +case0034_slice094 +case0034_slice095 +case0034_slice096 +case0034_slice097 +case0033_slice000 +case0033_slice001 +case0033_slice002 +case0033_slice003 +case0033_slice004 +case0033_slice005 +case0033_slice006 +case0033_slice007 +case0033_slice008 +case0033_slice009 +case0033_slice010 +case0033_slice011 +case0033_slice012 +case0033_slice013 +case0033_slice014 +case0033_slice015 +case0033_slice016 +case0033_slice017 +case0033_slice018 +case0033_slice019 +case0033_slice020 +case0033_slice021 +case0033_slice022 +case0033_slice023 +case0033_slice024 +case0033_slice025 +case0033_slice026 +case0033_slice027 +case0033_slice028 +case0033_slice029 +case0033_slice030 +case0033_slice031 
+case0033_slice032 +case0033_slice033 +case0033_slice034 +case0033_slice035 +case0033_slice036 +case0033_slice037 +case0033_slice038 +case0033_slice039 +case0033_slice040 +case0033_slice041 +case0033_slice042 +case0033_slice043 +case0033_slice044 +case0033_slice045 +case0033_slice046 +case0033_slice047 +case0033_slice048 +case0033_slice049 +case0033_slice050 +case0033_slice051 +case0033_slice052 +case0033_slice053 +case0033_slice054 +case0033_slice055 +case0033_slice056 +case0033_slice057 +case0033_slice058 +case0033_slice059 +case0033_slice060 +case0033_slice061 +case0033_slice062 +case0033_slice063 +case0033_slice064 +case0033_slice065 +case0033_slice066 +case0033_slice067 +case0033_slice068 +case0033_slice069 +case0033_slice070 +case0033_slice071 +case0033_slice072 +case0033_slice073 +case0033_slice074 +case0033_slice075 +case0033_slice076 +case0033_slice077 +case0033_slice078 +case0033_slice079 +case0033_slice080 +case0033_slice081 +case0033_slice082 +case0033_slice083 +case0033_slice084 +case0033_slice085 +case0033_slice086 +case0033_slice087 +case0033_slice088 +case0033_slice089 +case0033_slice090 +case0033_slice091 +case0033_slice092 +case0033_slice093 +case0033_slice094 +case0033_slice095 +case0033_slice096 +case0033_slice097 +case0033_slice098 +case0033_slice099 +case0033_slice100 +case0033_slice101 +case0033_slice102 +case0033_slice103 +case0030_slice000 +case0030_slice001 +case0030_slice002 +case0030_slice003 +case0030_slice004 +case0030_slice005 +case0030_slice006 +case0030_slice007 +case0030_slice008 +case0030_slice009 +case0030_slice010 +case0030_slice011 +case0030_slice012 +case0030_slice013 +case0030_slice014 +case0030_slice015 +case0030_slice016 +case0030_slice017 +case0030_slice018 +case0030_slice019 +case0030_slice020 +case0030_slice021 +case0030_slice022 +case0030_slice023 +case0030_slice024 +case0030_slice025 +case0030_slice026 +case0030_slice027 +case0030_slice028 +case0030_slice029 +case0030_slice030 +case0030_slice031 +case0030_slice032 
+case0030_slice033 +case0030_slice034 +case0030_slice035 +case0030_slice036 +case0030_slice037 +case0030_slice038 +case0030_slice039 +case0030_slice040 +case0030_slice041 +case0030_slice042 +case0030_slice043 +case0030_slice044 +case0030_slice045 +case0030_slice046 +case0030_slice047 +case0030_slice048 +case0030_slice049 +case0030_slice050 +case0030_slice051 +case0030_slice052 +case0030_slice053 +case0030_slice054 +case0030_slice055 +case0030_slice056 +case0030_slice057 +case0030_slice058 +case0030_slice059 +case0030_slice060 +case0030_slice061 +case0030_slice062 +case0030_slice063 +case0030_slice064 +case0030_slice065 +case0030_slice066 +case0030_slice067 +case0030_slice068 +case0030_slice069 +case0030_slice070 +case0030_slice071 +case0030_slice072 +case0030_slice073 +case0030_slice074 +case0030_slice075 +case0030_slice076 +case0030_slice077 +case0030_slice078 +case0030_slice079 +case0030_slice080 +case0030_slice081 +case0030_slice082 +case0030_slice083 +case0030_slice084 +case0030_slice085 +case0030_slice086 +case0030_slice087 +case0030_slice088 +case0030_slice089 +case0030_slice090 +case0030_slice091 +case0030_slice092 +case0030_slice093 +case0030_slice094 +case0030_slice095 +case0030_slice096 +case0030_slice097 +case0030_slice098 +case0030_slice099 +case0030_slice100 +case0030_slice101 +case0030_slice102 +case0030_slice103 +case0030_slice104 +case0030_slice105 +case0030_slice106 +case0030_slice107 +case0030_slice108 +case0030_slice109 +case0030_slice110 +case0030_slice111 +case0030_slice112 +case0030_slice113 +case0030_slice114 +case0030_slice115 +case0030_slice116 +case0030_slice117 +case0030_slice118 +case0030_slice119 +case0030_slice120 +case0030_slice121 +case0030_slice122 +case0030_slice123 +case0030_slice124 +case0030_slice125 +case0030_slice126 +case0030_slice127 +case0030_slice128 +case0030_slice129 +case0030_slice130 +case0030_slice131 +case0030_slice132 +case0030_slice133 +case0030_slice134 +case0030_slice135 +case0030_slice136 +case0030_slice137 
+case0030_slice138 +case0030_slice139 +case0030_slice140 +case0030_slice141 +case0030_slice142 +case0030_slice143 +case0030_slice144 +case0030_slice145 +case0030_slice146 +case0030_slice147 +case0030_slice148 +case0030_slice149 +case0030_slice150 +case0030_slice151 +case0030_slice152 +case0023_slice000 +case0023_slice001 +case0023_slice002 +case0023_slice003 +case0023_slice004 +case0023_slice005 +case0023_slice006 +case0023_slice007 +case0023_slice008 +case0023_slice009 +case0023_slice010 +case0023_slice011 +case0023_slice012 +case0023_slice013 +case0023_slice014 +case0023_slice015 +case0023_slice016 +case0023_slice017 +case0023_slice018 +case0023_slice019 +case0023_slice020 +case0023_slice021 +case0023_slice022 +case0023_slice023 +case0023_slice024 +case0023_slice025 +case0023_slice026 +case0023_slice027 +case0023_slice028 +case0023_slice029 +case0023_slice030 +case0023_slice031 +case0023_slice032 +case0023_slice033 +case0023_slice034 +case0023_slice035 +case0023_slice036 +case0023_slice037 +case0023_slice038 +case0023_slice039 +case0023_slice040 +case0023_slice041 +case0023_slice042 +case0023_slice043 +case0023_slice044 +case0023_slice045 +case0023_slice046 +case0023_slice047 +case0023_slice048 +case0023_slice049 +case0023_slice050 +case0023_slice051 +case0023_slice052 +case0023_slice053 +case0023_slice054 +case0023_slice055 +case0023_slice056 +case0023_slice057 +case0023_slice058 +case0023_slice059 +case0023_slice060 +case0023_slice061 +case0023_slice062 +case0023_slice063 +case0023_slice064 +case0023_slice065 +case0023_slice066 +case0023_slice067 +case0023_slice068 +case0023_slice069 +case0023_slice070 +case0023_slice071 +case0023_slice072 +case0023_slice073 +case0023_slice074 +case0023_slice075 +case0023_slice076 +case0023_slice077 +case0023_slice078 +case0023_slice079 +case0023_slice080 +case0023_slice081 +case0023_slice082 +case0023_slice083 +case0023_slice084 +case0023_slice085 +case0023_slice086 +case0023_slice087 +case0023_slice088 +case0023_slice089 
+case0023_slice090 +case0023_slice091 +case0023_slice092 +case0023_slice093 +case0023_slice094 +case0023_slice095 +case0040_slice000 +case0040_slice001 +case0040_slice002 +case0040_slice003 +case0040_slice004 +case0040_slice005 +case0040_slice006 +case0040_slice007 +case0040_slice008 +case0040_slice009 +case0040_slice010 +case0040_slice011 +case0040_slice012 +case0040_slice013 +case0040_slice014 +case0040_slice015 +case0040_slice016 +case0040_slice017 +case0040_slice018 +case0040_slice019 +case0040_slice020 +case0040_slice021 +case0040_slice022 +case0040_slice023 +case0040_slice024 +case0040_slice025 +case0040_slice026 +case0040_slice027 +case0040_slice028 +case0040_slice029 +case0040_slice030 +case0040_slice031 +case0040_slice032 +case0040_slice033 +case0040_slice034 +case0040_slice035 +case0040_slice036 +case0040_slice037 +case0040_slice038 +case0040_slice039 +case0040_slice040 +case0040_slice041 +case0040_slice042 +case0040_slice043 +case0040_slice044 +case0040_slice045 +case0040_slice046 +case0040_slice047 +case0040_slice048 +case0040_slice049 +case0040_slice050 +case0040_slice051 +case0040_slice052 +case0040_slice053 +case0040_slice054 +case0040_slice055 +case0040_slice056 +case0040_slice057 +case0040_slice058 +case0040_slice059 +case0040_slice060 +case0040_slice061 +case0040_slice062 +case0040_slice063 +case0040_slice064 +case0040_slice065 +case0040_slice066 +case0040_slice067 +case0040_slice068 +case0040_slice069 +case0040_slice070 +case0040_slice071 +case0040_slice072 +case0040_slice073 +case0040_slice074 +case0040_slice075 +case0040_slice076 +case0040_slice077 +case0040_slice078 +case0040_slice079 +case0040_slice080 +case0040_slice081 +case0040_slice082 +case0040_slice083 +case0040_slice084 +case0040_slice085 +case0040_slice086 +case0040_slice087 +case0040_slice088 +case0040_slice089 +case0040_slice090 +case0040_slice091 +case0040_slice092 +case0040_slice093 +case0040_slice094 +case0040_slice095 +case0040_slice096 +case0040_slice097 +case0040_slice098 
+case0040_slice099 +case0040_slice100 +case0040_slice101 +case0040_slice102 +case0040_slice103 +case0040_slice104 +case0040_slice105 +case0040_slice106 +case0040_slice107 +case0040_slice108 +case0040_slice109 +case0040_slice110 +case0040_slice111 +case0040_slice112 +case0040_slice113 +case0040_slice114 +case0040_slice115 +case0040_slice116 +case0040_slice117 +case0040_slice118 +case0040_slice119 +case0040_slice120 +case0040_slice121 +case0040_slice122 +case0040_slice123 +case0040_slice124 +case0040_slice125 +case0040_slice126 +case0040_slice127 +case0040_slice128 +case0040_slice129 +case0040_slice130 +case0040_slice131 +case0040_slice132 +case0040_slice133 +case0040_slice134 +case0040_slice135 +case0040_slice136 +case0040_slice137 +case0040_slice138 +case0040_slice139 +case0040_slice140 +case0040_slice141 +case0040_slice142 +case0040_slice143 +case0040_slice144 +case0040_slice145 +case0040_slice146 +case0040_slice147 +case0040_slice148 +case0040_slice149 +case0040_slice150 +case0040_slice151 +case0040_slice152 +case0040_slice153 +case0040_slice154 +case0040_slice155 +case0040_slice156 +case0040_slice157 +case0040_slice158 +case0040_slice159 +case0040_slice160 +case0040_slice161 +case0040_slice162 +case0040_slice163 +case0040_slice164 +case0040_slice165 +case0040_slice166 +case0040_slice167 +case0040_slice168 +case0040_slice169 +case0040_slice170 +case0040_slice171 +case0040_slice172 +case0040_slice173 +case0040_slice174 +case0040_slice175 +case0040_slice176 +case0040_slice177 +case0040_slice178 +case0040_slice179 +case0040_slice180 +case0040_slice181 +case0040_slice182 +case0040_slice183 +case0040_slice184 +case0040_slice185 +case0040_slice186 +case0040_slice187 +case0040_slice188 +case0040_slice189 +case0040_slice190 +case0040_slice191 +case0040_slice192 +case0040_slice193 +case0040_slice194 +case0010_slice000 +case0010_slice001 +case0010_slice002 +case0010_slice003 +case0010_slice004 +case0010_slice005 +case0010_slice006 +case0010_slice007 +case0010_slice008 
+case0010_slice009 +case0010_slice010 +case0010_slice011 +case0010_slice012 +case0010_slice013 +case0010_slice014 +case0010_slice015 +case0010_slice016 +case0010_slice017 +case0010_slice018 +case0010_slice019 +case0010_slice020 +case0010_slice021 +case0010_slice022 +case0010_slice023 +case0010_slice024 +case0010_slice025 +case0010_slice026 +case0010_slice027 +case0010_slice028 +case0010_slice029 +case0010_slice030 +case0010_slice031 +case0010_slice032 +case0010_slice033 +case0010_slice034 +case0010_slice035 +case0010_slice036 +case0010_slice037 +case0010_slice038 +case0010_slice039 +case0010_slice040 +case0010_slice041 +case0010_slice042 +case0010_slice043 +case0010_slice044 +case0010_slice045 +case0010_slice046 +case0010_slice047 +case0010_slice048 +case0010_slice049 +case0010_slice050 +case0010_slice051 +case0010_slice052 +case0010_slice053 +case0010_slice054 +case0010_slice055 +case0010_slice056 +case0010_slice057 +case0010_slice058 +case0010_slice059 +case0010_slice060 +case0010_slice061 +case0010_slice062 +case0010_slice063 +case0010_slice064 +case0010_slice065 +case0010_slice066 +case0010_slice067 +case0010_slice068 +case0010_slice069 +case0010_slice070 +case0010_slice071 +case0010_slice072 +case0010_slice073 +case0010_slice074 +case0010_slice075 +case0010_slice076 +case0010_slice077 +case0010_slice078 +case0010_slice079 +case0010_slice080 +case0010_slice081 +case0010_slice082 +case0010_slice083 +case0010_slice084 +case0010_slice085 +case0010_slice086 +case0010_slice087 +case0010_slice088 +case0010_slice089 +case0010_slice090 +case0010_slice091 +case0010_slice092 +case0010_slice093 +case0010_slice094 +case0010_slice095 +case0010_slice096 +case0010_slice097 +case0010_slice098 +case0010_slice099 +case0010_slice100 +case0010_slice101 +case0010_slice102 +case0010_slice103 +case0010_slice104 +case0010_slice105 +case0010_slice106 +case0010_slice107 +case0010_slice108 +case0010_slice109 +case0010_slice110 +case0010_slice111 +case0010_slice112 +case0010_slice113 
+case0010_slice114 +case0010_slice115 +case0010_slice116 +case0010_slice117 +case0010_slice118 +case0010_slice119 +case0010_slice120 +case0010_slice121 +case0010_slice122 +case0010_slice123 +case0010_slice124 +case0010_slice125 +case0010_slice126 +case0010_slice127 +case0010_slice128 +case0010_slice129 +case0010_slice130 +case0010_slice131 +case0010_slice132 +case0010_slice133 +case0010_slice134 +case0010_slice135 +case0010_slice136 +case0010_slice137 +case0010_slice138 +case0010_slice139 +case0010_slice140 +case0010_slice141 +case0010_slice142 +case0010_slice143 +case0010_slice144 +case0010_slice145 +case0010_slice146 +case0010_slice147 +case0021_slice000 +case0021_slice001 +case0021_slice002 +case0021_slice003 +case0021_slice004 +case0021_slice005 +case0021_slice006 +case0021_slice007 +case0021_slice008 +case0021_slice009 +case0021_slice010 +case0021_slice011 +case0021_slice012 +case0021_slice013 +case0021_slice014 +case0021_slice015 +case0021_slice016 +case0021_slice017 +case0021_slice018 +case0021_slice019 +case0021_slice020 +case0021_slice021 +case0021_slice022 +case0021_slice023 +case0021_slice024 +case0021_slice025 +case0021_slice026 +case0021_slice027 +case0021_slice028 +case0021_slice029 +case0021_slice030 +case0021_slice031 +case0021_slice032 +case0021_slice033 +case0021_slice034 +case0021_slice035 +case0021_slice036 +case0021_slice037 +case0021_slice038 +case0021_slice039 +case0021_slice040 +case0021_slice041 +case0021_slice042 +case0021_slice043 +case0021_slice044 +case0021_slice045 +case0021_slice046 +case0021_slice047 +case0021_slice048 +case0021_slice049 +case0021_slice050 +case0021_slice051 +case0021_slice052 +case0021_slice053 +case0021_slice054 +case0021_slice055 +case0021_slice056 +case0021_slice057 +case0021_slice058 +case0021_slice059 +case0021_slice060 +case0021_slice061 +case0021_slice062 +case0021_slice063 +case0021_slice064 +case0021_slice065 +case0021_slice066 +case0021_slice067 +case0021_slice068 +case0021_slice069 +case0021_slice070 
+case0021_slice071 +case0021_slice072 +case0021_slice073 +case0021_slice074 +case0021_slice075 +case0021_slice076 +case0021_slice077 +case0021_slice078 +case0021_slice079 +case0021_slice080 +case0021_slice081 +case0021_slice082 +case0021_slice083 +case0021_slice084 +case0021_slice085 +case0021_slice086 +case0021_slice087 +case0021_slice088 +case0021_slice089 +case0021_slice090 +case0021_slice091 +case0021_slice092 +case0021_slice093 +case0021_slice094 +case0021_slice095 +case0021_slice096 +case0021_slice097 +case0021_slice098 +case0021_slice099 +case0021_slice100 +case0021_slice101 +case0021_slice102 +case0021_slice103 +case0021_slice104 +case0021_slice105 +case0021_slice106 +case0021_slice107 +case0021_slice108 +case0021_slice109 +case0021_slice110 +case0021_slice111 +case0021_slice112 +case0021_slice113 +case0021_slice114 +case0021_slice115 +case0021_slice116 +case0021_slice117 +case0021_slice118 +case0021_slice119 +case0021_slice120 +case0021_slice121 +case0021_slice122 +case0021_slice123 +case0021_slice124 +case0021_slice125 +case0021_slice126 +case0021_slice127 +case0021_slice128 +case0021_slice129 +case0021_slice130 +case0021_slice131 +case0021_slice132 +case0021_slice133 +case0021_slice134 +case0021_slice135 +case0021_slice136 +case0021_slice137 +case0021_slice138 +case0021_slice139 +case0021_slice140 +case0021_slice141 +case0021_slice142 +case0006_slice000 +case0006_slice001 +case0006_slice002 +case0006_slice003 +case0006_slice004 +case0006_slice005 +case0006_slice006 +case0006_slice007 +case0006_slice008 +case0006_slice009 +case0006_slice010 +case0006_slice011 +case0006_slice012 +case0006_slice013 +case0006_slice014 +case0006_slice015 +case0006_slice016 +case0006_slice017 +case0006_slice018 +case0006_slice019 +case0006_slice020 +case0006_slice021 +case0006_slice022 +case0006_slice023 +case0006_slice024 +case0006_slice025 +case0006_slice026 +case0006_slice027 +case0006_slice028 +case0006_slice029 +case0006_slice030 +case0006_slice031 +case0006_slice032 
+case0006_slice033 +case0006_slice034 +case0006_slice035 +case0006_slice036 +case0006_slice037 +case0006_slice038 +case0006_slice039 +case0006_slice040 +case0006_slice041 +case0006_slice042 +case0006_slice043 +case0006_slice044 +case0006_slice045 +case0006_slice046 +case0006_slice047 +case0006_slice048 +case0006_slice049 +case0006_slice050 +case0006_slice051 +case0006_slice052 +case0006_slice053 +case0006_slice054 +case0006_slice055 +case0006_slice056 +case0006_slice057 +case0006_slice058 +case0006_slice059 +case0006_slice060 +case0006_slice061 +case0006_slice062 +case0006_slice063 +case0006_slice064 +case0006_slice065 +case0006_slice066 +case0006_slice067 +case0006_slice068 +case0006_slice069 +case0006_slice070 +case0006_slice071 +case0006_slice072 +case0006_slice073 +case0006_slice074 +case0006_slice075 +case0006_slice076 +case0006_slice077 +case0006_slice078 +case0006_slice079 +case0006_slice080 +case0006_slice081 +case0006_slice082 +case0006_slice083 +case0006_slice084 +case0006_slice085 +case0006_slice086 +case0006_slice087 +case0006_slice088 +case0006_slice089 +case0006_slice090 +case0006_slice091 +case0006_slice092 +case0006_slice093 +case0006_slice094 +case0006_slice095 +case0006_slice096 +case0006_slice097 +case0006_slice098 +case0006_slice099 +case0006_slice100 +case0006_slice101 +case0006_slice102 +case0006_slice103 +case0006_slice104 +case0006_slice105 +case0006_slice106 +case0006_slice107 +case0006_slice108 +case0006_slice109 +case0006_slice110 +case0006_slice111 +case0006_slice112 +case0006_slice113 +case0006_slice114 +case0006_slice115 +case0006_slice116 +case0006_slice117 +case0006_slice118 +case0006_slice119 +case0006_slice120 +case0006_slice121 +case0006_slice122 +case0006_slice123 +case0006_slice124 +case0006_slice125 +case0006_slice126 +case0006_slice127 +case0006_slice128 +case0006_slice129 +case0006_slice130 +case0027_slice000 +case0027_slice001 +case0027_slice002 +case0027_slice003 +case0027_slice004 +case0027_slice005 +case0027_slice006 
+case0027_slice007 +case0027_slice008 +case0027_slice009 +case0027_slice010 +case0027_slice011 +case0027_slice012 +case0027_slice013 +case0027_slice014 +case0027_slice015 +case0027_slice016 +case0027_slice017 +case0027_slice018 +case0027_slice019 +case0027_slice020 +case0027_slice021 +case0027_slice022 +case0027_slice023 +case0027_slice024 +case0027_slice025 +case0027_slice026 +case0027_slice027 +case0027_slice028 +case0027_slice029 +case0027_slice030 +case0027_slice031 +case0027_slice032 +case0027_slice033 +case0027_slice034 +case0027_slice035 +case0027_slice036 +case0027_slice037 +case0027_slice038 +case0027_slice039 +case0027_slice040 +case0027_slice041 +case0027_slice042 +case0027_slice043 +case0027_slice044 +case0027_slice045 +case0027_slice046 +case0027_slice047 +case0027_slice048 +case0027_slice049 +case0027_slice050 +case0027_slice051 +case0027_slice052 +case0027_slice053 +case0027_slice054 +case0027_slice055 +case0027_slice056 +case0027_slice057 +case0027_slice058 +case0027_slice059 +case0027_slice060 +case0027_slice061 +case0027_slice062 +case0027_slice063 +case0027_slice064 +case0027_slice065 +case0027_slice066 +case0027_slice067 +case0027_slice068 +case0027_slice069 +case0027_slice070 +case0027_slice071 +case0027_slice072 +case0027_slice073 +case0027_slice074 +case0027_slice075 +case0027_slice076 +case0027_slice077 +case0027_slice078 +case0027_slice079 +case0027_slice080 +case0027_slice081 +case0027_slice082 +case0027_slice083 +case0027_slice084 +case0027_slice085 +case0027_slice086 +case0027_slice087 +case0028_slice000 +case0028_slice001 +case0028_slice002 +case0028_slice003 +case0028_slice004 +case0028_slice005 +case0028_slice006 +case0028_slice007 +case0028_slice008 +case0028_slice009 +case0028_slice010 +case0028_slice011 +case0028_slice012 +case0028_slice013 +case0028_slice014 +case0028_slice015 +case0028_slice016 +case0028_slice017 +case0028_slice018 +case0028_slice019 +case0028_slice020 +case0028_slice021 +case0028_slice022 +case0028_slice023 
+case0028_slice024 +case0028_slice025 +case0028_slice026 +case0028_slice027 +case0028_slice028 +case0028_slice029 +case0028_slice030 +case0028_slice031 +case0028_slice032 +case0028_slice033 +case0028_slice034 +case0028_slice035 +case0028_slice036 +case0028_slice037 +case0028_slice038 +case0028_slice039 +case0028_slice040 +case0028_slice041 +case0028_slice042 +case0028_slice043 +case0028_slice044 +case0028_slice045 +case0028_slice046 +case0028_slice047 +case0028_slice048 +case0028_slice049 +case0028_slice050 +case0028_slice051 +case0028_slice052 +case0028_slice053 +case0028_slice054 +case0028_slice055 +case0028_slice056 +case0028_slice057 +case0028_slice058 +case0028_slice059 +case0028_slice060 +case0028_slice061 +case0028_slice062 +case0028_slice063 +case0028_slice064 +case0028_slice065 +case0028_slice066 +case0028_slice067 +case0028_slice068 +case0028_slice069 +case0028_slice070 +case0028_slice071 +case0028_slice072 +case0028_slice073 +case0028_slice074 +case0028_slice075 +case0028_slice076 +case0028_slice077 +case0028_slice078 +case0028_slice079 +case0028_slice080 +case0028_slice081 +case0028_slice082 +case0028_slice083 +case0028_slice084 +case0028_slice085 +case0028_slice086 +case0028_slice087 +case0028_slice088 +case0037_slice000 +case0037_slice001 +case0037_slice002 +case0037_slice003 +case0037_slice004 +case0037_slice005 +case0037_slice006 +case0037_slice007 +case0037_slice008 +case0037_slice009 +case0037_slice010 +case0037_slice011 +case0037_slice012 +case0037_slice013 +case0037_slice014 +case0037_slice015 +case0037_slice016 +case0037_slice017 +case0037_slice018 +case0037_slice019 +case0037_slice020 +case0037_slice021 +case0037_slice022 +case0037_slice023 +case0037_slice024 +case0037_slice025 +case0037_slice026 +case0037_slice027 +case0037_slice028 +case0037_slice029 +case0037_slice030 +case0037_slice031 +case0037_slice032 +case0037_slice033 +case0037_slice034 +case0037_slice035 +case0037_slice036 +case0037_slice037 +case0037_slice038 +case0037_slice039 
+case0037_slice040 +case0037_slice041 +case0037_slice042 +case0037_slice043 +case0037_slice044 +case0037_slice045 +case0037_slice046 +case0037_slice047 +case0037_slice048 +case0037_slice049 +case0037_slice050 +case0037_slice051 +case0037_slice052 +case0037_slice053 +case0037_slice054 +case0037_slice055 +case0037_slice056 +case0037_slice057 +case0037_slice058 +case0037_slice059 +case0037_slice060 +case0037_slice061 +case0037_slice062 +case0037_slice063 +case0037_slice064 +case0037_slice065 +case0037_slice066 +case0037_slice067 +case0037_slice068 +case0037_slice069 +case0037_slice070 +case0037_slice071 +case0037_slice072 +case0037_slice073 +case0037_slice074 +case0037_slice075 +case0037_slice076 +case0037_slice077 +case0037_slice078 +case0037_slice079 +case0037_slice080 +case0037_slice081 +case0037_slice082 +case0037_slice083 +case0037_slice084 +case0037_slice085 +case0037_slice086 +case0037_slice087 +case0037_slice088 +case0037_slice089 +case0037_slice090 +case0037_slice091 +case0037_slice092 +case0037_slice093 +case0037_slice094 +case0037_slice095 +case0037_slice096 +case0037_slice097 +case0037_slice098 diff --git a/PuzzleTuning/SSL_structures/TransUNet_main/networks/vit_seg_configs.py b/PuzzleTuning/SSL_structures/TransUNet_main/networks/vit_seg_configs.py new file mode 100644 index 0000000000000000000000000000000000000000..1bc4c784cd439720493cb17f333b683ae0494032 --- /dev/null +++ b/PuzzleTuning/SSL_structures/TransUNet_main/networks/vit_seg_configs.py @@ -0,0 +1,130 @@ +import ml_collections + +def get_b16_config(): + """Returns the ViT-B/16 configuration.""" + config = ml_collections.ConfigDict() + config.patches = ml_collections.ConfigDict({'size': (16, 16)}) + config.hidden_size = 768 + config.transformer = ml_collections.ConfigDict() + config.transformer.mlp_dim = 3072 + config.transformer.num_heads = 12 + config.transformer.num_layers = 12 + config.transformer.attention_dropout_rate = 0.0 + config.transformer.dropout_rate = 0.1 + + config.classifier = 'seg' 
+ config.representation_size = None + config.resnet_pretrained_path = None + config.pretrained_path = '../model/vit_checkpoint/imagenet21k/ViT-B_16.npz' + config.patch_size = 16 + + config.decoder_channels = (256, 128, 64, 16) + config.n_classes = 2 + config.activation = 'softmax' + return config + + +def get_testing(): + """Returns a minimal configuration for testing.""" + config = ml_collections.ConfigDict() + config.patches = ml_collections.ConfigDict({'size': (16, 16)}) + config.hidden_size = 1 + config.transformer = ml_collections.ConfigDict() + config.transformer.mlp_dim = 1 + config.transformer.num_heads = 1 + config.transformer.num_layers = 1 + config.transformer.attention_dropout_rate = 0.0 + config.transformer.dropout_rate = 0.1 + config.classifier = 'token' + config.representation_size = None + return config + +def get_r50_b16_config(): + """Returns the Resnet50 + ViT-B/16 configuration.""" + config = get_b16_config() + config.patches.grid = (16, 16) + config.resnet = ml_collections.ConfigDict() + config.resnet.num_layers = (3, 4, 9) + config.resnet.width_factor = 1 + + config.classifier = 'seg' + config.pretrained_path = '../model/vit_checkpoint/imagenet21k/R50+ViT-B_16.npz' + config.decoder_channels = (256, 128, 64, 16) + config.skip_channels = [512, 256, 64, 16] + config.n_classes = 2 + config.n_skip = 3 + config.activation = 'softmax' + + return config + + +def get_b32_config(): + """Returns the ViT-B/32 configuration.""" + config = get_b16_config() + config.patches.size = (32, 32) + config.pretrained_path = '../model/vit_checkpoint/imagenet21k/ViT-B_32.npz' + return config + + +def get_l16_config(): + """Returns the ViT-L/16 configuration.""" + config = ml_collections.ConfigDict() + config.patches = ml_collections.ConfigDict({'size': (16, 16)}) + config.hidden_size = 1024 + config.transformer = ml_collections.ConfigDict() + config.transformer.mlp_dim = 4096 + config.transformer.num_heads = 16 + config.transformer.num_layers = 24 + 
config.transformer.attention_dropout_rate = 0.0 + config.transformer.dropout_rate = 0.1 + config.representation_size = None + + # custom + config.classifier = 'seg' + config.resnet_pretrained_path = None + config.pretrained_path = '../model/vit_checkpoint/imagenet21k/ViT-L_16.npz' + config.decoder_channels = (256, 128, 64, 16) + config.n_classes = 2 + config.activation = 'softmax' + return config + + +def get_r50_l16_config(): + """Returns the Resnet50 + ViT-L/16 configuration. customized """ + config = get_l16_config() + config.patches.grid = (16, 16) + config.resnet = ml_collections.ConfigDict() + config.resnet.num_layers = (3, 4, 9) + config.resnet.width_factor = 1 + + config.classifier = 'seg' + config.resnet_pretrained_path = '../model/vit_checkpoint/imagenet21k/R50+ViT-B_16.npz' + config.decoder_channels = (256, 128, 64, 16) + config.skip_channels = [512, 256, 64, 16] + config.n_classes = 2 + config.activation = 'softmax' + return config + + +def get_l32_config(): + """Returns the ViT-L/32 configuration.""" + config = get_l16_config() + config.patches.size = (32, 32) + return config + + +def get_h14_config(): + """Returns the ViT-L/16 configuration.""" + config = ml_collections.ConfigDict() + config.patches = ml_collections.ConfigDict({'size': (14, 14)}) + config.hidden_size = 1280 + config.transformer = ml_collections.ConfigDict() + config.transformer.mlp_dim = 5120 + config.transformer.num_heads = 16 + config.transformer.num_layers = 32 + config.transformer.attention_dropout_rate = 0.0 + config.transformer.dropout_rate = 0.1 + config.classifier = 'token' + config.representation_size = None + + return config diff --git a/PuzzleTuning/SSL_structures/TransUNet_main/networks/vit_seg_modeling.py b/PuzzleTuning/SSL_structures/TransUNet_main/networks/vit_seg_modeling.py new file mode 100644 index 0000000000000000000000000000000000000000..8346d9f166fea32cc7007b95b3878687400a7734 --- /dev/null +++ 
b/PuzzleTuning/SSL_structures/TransUNet_main/networks/vit_seg_modeling.py @@ -0,0 +1,453 @@ +# coding=utf-8 +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy +import logging +import math + +from os.path import join as pjoin + +import torch +import torch.nn as nn +import numpy as np + +from torch.nn import CrossEntropyLoss, Dropout, Softmax, Linear, Conv2d, LayerNorm +from torch.nn.modules.utils import _pair +from scipy import ndimage +from . import vit_seg_configs as configs +from .vit_seg_modeling_resnet_skip import ResNetV2 + + +logger = logging.getLogger(__name__) + + +ATTENTION_Q = "MultiHeadDotProductAttention_1/query" +ATTENTION_K = "MultiHeadDotProductAttention_1/key" +ATTENTION_V = "MultiHeadDotProductAttention_1/value" +ATTENTION_OUT = "MultiHeadDotProductAttention_1/out" +FC_0 = "MlpBlock_3/Dense_0" +FC_1 = "MlpBlock_3/Dense_1" +ATTENTION_NORM = "LayerNorm_0" +MLP_NORM = "LayerNorm_2" + + +def np2th(weights, conv=False): + """Possibly convert HWIO to OIHW.""" + if conv: + weights = weights.transpose([3, 2, 0, 1]) + return torch.from_numpy(weights) + + +def swish(x): + return x * torch.sigmoid(x) + + +ACT2FN = {"gelu": torch.nn.functional.gelu, "relu": torch.nn.functional.relu, "swish": swish} + + +class Attention(nn.Module): + def __init__(self, config, vis): + super(Attention, self).__init__() + self.vis = vis + self.num_attention_heads = config.transformer["num_heads"] + self.attention_head_size = int(config.hidden_size / self.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = Linear(config.hidden_size, self.all_head_size) + self.key = Linear(config.hidden_size, self.all_head_size) + self.value = Linear(config.hidden_size, self.all_head_size) + + self.out = Linear(config.hidden_size, config.hidden_size) + self.attn_dropout = Dropout(config.transformer["attention_dropout_rate"]) + self.proj_dropout = 
Dropout(config.transformer["attention_dropout_rate"]) + + self.softmax = Softmax(dim=-1) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward(self, hidden_states): + mixed_query_layer = self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = self.transpose_for_scores(mixed_key_layer) + value_layer = self.transpose_for_scores(mixed_value_layer) + + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + attention_probs = self.softmax(attention_scores) + weights = attention_probs if self.vis else None + attention_probs = self.attn_dropout(attention_probs) + + context_layer = torch.matmul(attention_probs, value_layer) + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + attention_output = self.out(context_layer) + attention_output = self.proj_dropout(attention_output) + return attention_output, weights + + +class Mlp(nn.Module): + def __init__(self, config): + super(Mlp, self).__init__() + self.fc1 = Linear(config.hidden_size, config.transformer["mlp_dim"]) + self.fc2 = Linear(config.transformer["mlp_dim"], config.hidden_size) + self.act_fn = ACT2FN["gelu"] + self.dropout = Dropout(config.transformer["dropout_rate"]) + + self._init_weights() + + def _init_weights(self): + nn.init.xavier_uniform_(self.fc1.weight) + nn.init.xavier_uniform_(self.fc2.weight) + nn.init.normal_(self.fc1.bias, std=1e-6) + nn.init.normal_(self.fc2.bias, std=1e-6) + + def forward(self, x): + x = self.fc1(x) + x = self.act_fn(x) + x = self.dropout(x) + x = self.fc2(x) + 
x = self.dropout(x) + return x + + +class Embeddings(nn.Module): + """Construct the embeddings from patch, position embeddings. + """ + def __init__(self, config, img_size, in_channels=3): + super(Embeddings, self).__init__() + self.hybrid = None + self.config = config + img_size = _pair(img_size) + + if config.patches.get("grid") is not None: # ResNet + grid_size = config.patches["grid"] + patch_size = (img_size[0] // 16 // grid_size[0], img_size[1] // 16 // grid_size[1]) + patch_size_real = (patch_size[0] * 16, patch_size[1] * 16) + n_patches = (img_size[0] // patch_size_real[0]) * (img_size[1] // patch_size_real[1]) + self.hybrid = True + else: + patch_size = _pair(config.patches["size"]) + n_patches = (img_size[0] // patch_size[0]) * (img_size[1] // patch_size[1]) + self.hybrid = False + + if self.hybrid: + self.hybrid_model = ResNetV2(block_units=config.resnet.num_layers, width_factor=config.resnet.width_factor) + in_channels = self.hybrid_model.width * 16 + self.patch_embeddings = Conv2d(in_channels=in_channels, + out_channels=config.hidden_size, + kernel_size=patch_size, + stride=patch_size) + self.position_embeddings = nn.Parameter(torch.zeros(1, n_patches, config.hidden_size)) + + self.dropout = Dropout(config.transformer["dropout_rate"]) + + + def forward(self, x): + if self.hybrid: + x, features = self.hybrid_model(x) + else: + features = None + x = self.patch_embeddings(x) # (B, hidden. 
n_patches^(1/2), n_patches^(1/2)) + x = x.flatten(2) + x = x.transpose(-1, -2) # (B, n_patches, hidden) + + embeddings = x + self.position_embeddings + embeddings = self.dropout(embeddings) + return embeddings, features + + +class Block(nn.Module): + def __init__(self, config, vis): + super(Block, self).__init__() + self.hidden_size = config.hidden_size + self.attention_norm = LayerNorm(config.hidden_size, eps=1e-6) + self.ffn_norm = LayerNorm(config.hidden_size, eps=1e-6) + self.ffn = Mlp(config) + self.attn = Attention(config, vis) + + def forward(self, x): + h = x + x = self.attention_norm(x) + x, weights = self.attn(x) + x = x + h + + h = x + x = self.ffn_norm(x) + x = self.ffn(x) + x = x + h + return x, weights + + def load_from(self, weights, n_block): + ROOT = f"Transformer/encoderblock_{n_block}" + with torch.no_grad(): + query_weight = np2th(weights[pjoin(ROOT, ATTENTION_Q, "kernel")]).view(self.hidden_size, self.hidden_size).t() + key_weight = np2th(weights[pjoin(ROOT, ATTENTION_K, "kernel")]).view(self.hidden_size, self.hidden_size).t() + value_weight = np2th(weights[pjoin(ROOT, ATTENTION_V, "kernel")]).view(self.hidden_size, self.hidden_size).t() + out_weight = np2th(weights[pjoin(ROOT, ATTENTION_OUT, "kernel")]).view(self.hidden_size, self.hidden_size).t() + + query_bias = np2th(weights[pjoin(ROOT, ATTENTION_Q, "bias")]).view(-1) + key_bias = np2th(weights[pjoin(ROOT, ATTENTION_K, "bias")]).view(-1) + value_bias = np2th(weights[pjoin(ROOT, ATTENTION_V, "bias")]).view(-1) + out_bias = np2th(weights[pjoin(ROOT, ATTENTION_OUT, "bias")]).view(-1) + + self.attn.query.weight.copy_(query_weight) + self.attn.key.weight.copy_(key_weight) + self.attn.value.weight.copy_(value_weight) + self.attn.out.weight.copy_(out_weight) + self.attn.query.bias.copy_(query_bias) + self.attn.key.bias.copy_(key_bias) + self.attn.value.bias.copy_(value_bias) + self.attn.out.bias.copy_(out_bias) + + mlp_weight_0 = np2th(weights[pjoin(ROOT, FC_0, "kernel")]).t() + mlp_weight_1 = 
np2th(weights[pjoin(ROOT, FC_1, "kernel")]).t() + mlp_bias_0 = np2th(weights[pjoin(ROOT, FC_0, "bias")]).t() + mlp_bias_1 = np2th(weights[pjoin(ROOT, FC_1, "bias")]).t() + + self.ffn.fc1.weight.copy_(mlp_weight_0) + self.ffn.fc2.weight.copy_(mlp_weight_1) + self.ffn.fc1.bias.copy_(mlp_bias_0) + self.ffn.fc2.bias.copy_(mlp_bias_1) + + self.attention_norm.weight.copy_(np2th(weights[pjoin(ROOT, ATTENTION_NORM, "scale")])) + self.attention_norm.bias.copy_(np2th(weights[pjoin(ROOT, ATTENTION_NORM, "bias")])) + self.ffn_norm.weight.copy_(np2th(weights[pjoin(ROOT, MLP_NORM, "scale")])) + self.ffn_norm.bias.copy_(np2th(weights[pjoin(ROOT, MLP_NORM, "bias")])) + + +class Encoder(nn.Module): + def __init__(self, config, vis): + super(Encoder, self).__init__() + self.vis = vis + self.layer = nn.ModuleList() + self.encoder_norm = LayerNorm(config.hidden_size, eps=1e-6) + for _ in range(config.transformer["num_layers"]): + layer = Block(config, vis) + self.layer.append(copy.deepcopy(layer)) + + def forward(self, hidden_states): + attn_weights = [] + for layer_block in self.layer: + hidden_states, weights = layer_block(hidden_states) + if self.vis: + attn_weights.append(weights) + encoded = self.encoder_norm(hidden_states) + return encoded, attn_weights + + +class Transformer(nn.Module): + def __init__(self, config, img_size, vis): + super(Transformer, self).__init__() + self.embeddings = Embeddings(config, img_size=img_size) + self.encoder = Encoder(config, vis) + + def forward(self, input_ids): + embedding_output, features = self.embeddings(input_ids) + encoded, attn_weights = self.encoder(embedding_output) # (B, n_patch, hidden) + return encoded, attn_weights, features + + +class Conv2dReLU(nn.Sequential): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + padding=0, + stride=1, + use_batchnorm=True, + ): + conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + bias=not (use_batchnorm), + ) + relu = 
nn.ReLU(inplace=True) + + bn = nn.BatchNorm2d(out_channels) + + super(Conv2dReLU, self).__init__(conv, bn, relu) + + +class DecoderBlock(nn.Module): + def __init__( + self, + in_channels, + out_channels, + skip_channels=0, + use_batchnorm=True, + ): + super().__init__() + self.conv1 = Conv2dReLU( + in_channels + skip_channels, + out_channels, + kernel_size=3, + padding=1, + use_batchnorm=use_batchnorm, + ) + self.conv2 = Conv2dReLU( + out_channels, + out_channels, + kernel_size=3, + padding=1, + use_batchnorm=use_batchnorm, + ) + self.up = nn.UpsamplingBilinear2d(scale_factor=2) + + def forward(self, x, skip=None): + x = self.up(x) + if skip is not None: + x = torch.cat([x, skip], dim=1) + x = self.conv1(x) + x = self.conv2(x) + return x + + +class SegmentationHead(nn.Sequential): + + def __init__(self, in_channels, out_channels, kernel_size=3, upsampling=1): + conv2d = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, padding=kernel_size // 2) + upsampling = nn.UpsamplingBilinear2d(scale_factor=upsampling) if upsampling > 1 else nn.Identity() + super().__init__(conv2d, upsampling) + + +class DecoderCup(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + head_channels = 512 + self.conv_more = Conv2dReLU( + config.hidden_size, + head_channels, + kernel_size=3, + padding=1, + use_batchnorm=True, + ) + decoder_channels = config.decoder_channels + in_channels = [head_channels] + list(decoder_channels[:-1]) + out_channels = decoder_channels + + if self.config.n_skip != 0: + skip_channels = self.config.skip_channels + for i in range(4-self.config.n_skip): # re-select the skip channels according to n_skip + skip_channels[3-i]=0 + + else: + skip_channels=[0,0,0,0] + + blocks = [ + DecoderBlock(in_ch, out_ch, sk_ch) for in_ch, out_ch, sk_ch in zip(in_channels, out_channels, skip_channels) + ] + self.blocks = nn.ModuleList(blocks) + + def forward(self, hidden_states, features=None): + B, n_patch, hidden = hidden_states.size() # 
reshape from (B, n_patch, hidden) to (B, h, w, hidden) + h, w = int(np.sqrt(n_patch)), int(np.sqrt(n_patch)) + x = hidden_states.permute(0, 2, 1) + x = x.contiguous().view(B, hidden, h, w) + x = self.conv_more(x) + for i, decoder_block in enumerate(self.blocks): + if features is not None: + skip = features[i] if (i < self.config.n_skip) else None + else: + skip = None + x = decoder_block(x, skip=skip) + return x + + +class VisionTransformer(nn.Module): + def __init__(self, config, img_size=224, num_classes=21843, zero_head=False, vis=False): + super(VisionTransformer, self).__init__() + self.num_classes = num_classes + self.zero_head = zero_head + self.classifier = config.classifier + self.transformer = Transformer(config, img_size, vis) + self.decoder = DecoderCup(config) + self.segmentation_head = SegmentationHead( + in_channels=config['decoder_channels'][-1], + out_channels=config['n_classes'], + kernel_size=3, + ) + self.config = config + + def forward(self, x): + if x.size()[1] == 1: + x = x.repeat(1,3,1,1) + x, attn_weights, features = self.transformer(x) # (B, n_patch, hidden) + x = self.decoder(x, features) + logits = self.segmentation_head(x) + return logits + + def load_from(self, weights): + with torch.no_grad(): + + res_weight = weights + self.transformer.embeddings.patch_embeddings.weight.copy_(np2th(weights["embedding/kernel"], conv=True)) + self.transformer.embeddings.patch_embeddings.bias.copy_(np2th(weights["embedding/bias"])) + + self.transformer.encoder.encoder_norm.weight.copy_(np2th(weights["Transformer/encoder_norm/scale"])) + self.transformer.encoder.encoder_norm.bias.copy_(np2th(weights["Transformer/encoder_norm/bias"])) + + posemb = np2th(weights["Transformer/posembed_input/pos_embedding"]) + + posemb_new = self.transformer.embeddings.position_embeddings + if posemb.size() == posemb_new.size(): + self.transformer.embeddings.position_embeddings.copy_(posemb) + elif posemb.size()[1]-1 == posemb_new.size()[1]: + posemb = posemb[:, 1:] + 
self.transformer.embeddings.position_embeddings.copy_(posemb) + else: + logger.info("load_pretrained: resized variant: %s to %s" % (posemb.size(), posemb_new.size())) + ntok_new = posemb_new.size(1) + if self.classifier == "seg": + _, posemb_grid = posemb[:, :1], posemb[0, 1:] + gs_old = int(np.sqrt(len(posemb_grid))) + gs_new = int(np.sqrt(ntok_new)) + print('load_pretrained: grid-size from %s to %s' % (gs_old, gs_new)) + posemb_grid = posemb_grid.reshape(gs_old, gs_old, -1) + zoom = (gs_new / gs_old, gs_new / gs_old, 1) + posemb_grid = ndimage.zoom(posemb_grid, zoom, order=1) # th2np + posemb_grid = posemb_grid.reshape(1, gs_new * gs_new, -1) + posemb = posemb_grid + self.transformer.embeddings.position_embeddings.copy_(np2th(posemb)) + + # Encoder whole + for bname, block in self.transformer.encoder.named_children(): + for uname, unit in block.named_children(): + unit.load_from(weights, n_block=uname) + + if self.transformer.embeddings.hybrid: + self.transformer.embeddings.hybrid_model.root.conv.weight.copy_(np2th(res_weight["conv_root/kernel"], conv=True)) + gn_weight = np2th(res_weight["gn_root/scale"]).view(-1) + gn_bias = np2th(res_weight["gn_root/bias"]).view(-1) + self.transformer.embeddings.hybrid_model.root.gn.weight.copy_(gn_weight) + self.transformer.embeddings.hybrid_model.root.gn.bias.copy_(gn_bias) + + for bname, block in self.transformer.embeddings.hybrid_model.body.named_children(): + for uname, unit in block.named_children(): + unit.load_from(res_weight, n_block=bname, n_unit=uname) + +CONFIGS = { + 'ViT-B_16': configs.get_b16_config(), + 'ViT-B_32': configs.get_b32_config(), + 'ViT-L_16': configs.get_l16_config(), + 'ViT-L_32': configs.get_l32_config(), + 'ViT-H_14': configs.get_h14_config(), + 'R50-ViT-B_16': configs.get_r50_b16_config(), + 'R50-ViT-L_16': configs.get_r50_l16_config(), + 'testing': configs.get_testing(), +} + + diff --git a/PuzzleTuning/SSL_structures/TransUNet_main/networks/vit_seg_modeling_resnet_skip.py 
b/PuzzleTuning/SSL_structures/TransUNet_main/networks/vit_seg_modeling_resnet_skip.py new file mode 100644 index 0000000000000000000000000000000000000000..9753d52fbe8275e77cc18870c1e9f9564d8cc008 --- /dev/null +++ b/PuzzleTuning/SSL_structures/TransUNet_main/networks/vit_seg_modeling_resnet_skip.py @@ -0,0 +1,160 @@ +import math + +from os.path import join as pjoin +from collections import OrderedDict + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def np2th(weights, conv=False): + """Possibly convert HWIO to OIHW.""" + if conv: + weights = weights.transpose([3, 2, 0, 1]) + return torch.from_numpy(weights) + + +class StdConv2d(nn.Conv2d): + + def forward(self, x): + w = self.weight + v, m = torch.var_mean(w, dim=[1, 2, 3], keepdim=True, unbiased=False) + w = (w - m) / torch.sqrt(v + 1e-5) + return F.conv2d(x, w, self.bias, self.stride, self.padding, + self.dilation, self.groups) + + +def conv3x3(cin, cout, stride=1, groups=1, bias=False): + return StdConv2d(cin, cout, kernel_size=3, stride=stride, + padding=1, bias=bias, groups=groups) + + +def conv1x1(cin, cout, stride=1, bias=False): + return StdConv2d(cin, cout, kernel_size=1, stride=stride, + padding=0, bias=bias) + + +class PreActBottleneck(nn.Module): + """Pre-activation (v2) bottleneck block. + """ + + def __init__(self, cin, cout=None, cmid=None, stride=1): + super().__init__() + cout = cout or cin + cmid = cmid or cout//4 + + self.gn1 = nn.GroupNorm(32, cmid, eps=1e-6) + self.conv1 = conv1x1(cin, cmid, bias=False) + self.gn2 = nn.GroupNorm(32, cmid, eps=1e-6) + self.conv2 = conv3x3(cmid, cmid, stride, bias=False) # Original code has it on conv1!! + self.gn3 = nn.GroupNorm(32, cout, eps=1e-6) + self.conv3 = conv1x1(cmid, cout, bias=False) + self.relu = nn.ReLU(inplace=True) + + if (stride != 1 or cin != cout): + # Projection also with pre-activation according to paper. 
+ self.downsample = conv1x1(cin, cout, stride, bias=False) + self.gn_proj = nn.GroupNorm(cout, cout) + + def forward(self, x): + + # Residual branch + residual = x + if hasattr(self, 'downsample'): + residual = self.downsample(x) + residual = self.gn_proj(residual) + + # Unit's branch + y = self.relu(self.gn1(self.conv1(x))) + y = self.relu(self.gn2(self.conv2(y))) + y = self.gn3(self.conv3(y)) + + y = self.relu(residual + y) + return y + + def load_from(self, weights, n_block, n_unit): + conv1_weight = np2th(weights[pjoin(n_block, n_unit, "conv1/kernel")], conv=True) + conv2_weight = np2th(weights[pjoin(n_block, n_unit, "conv2/kernel")], conv=True) + conv3_weight = np2th(weights[pjoin(n_block, n_unit, "conv3/kernel")], conv=True) + + gn1_weight = np2th(weights[pjoin(n_block, n_unit, "gn1/scale")]) + gn1_bias = np2th(weights[pjoin(n_block, n_unit, "gn1/bias")]) + + gn2_weight = np2th(weights[pjoin(n_block, n_unit, "gn2/scale")]) + gn2_bias = np2th(weights[pjoin(n_block, n_unit, "gn2/bias")]) + + gn3_weight = np2th(weights[pjoin(n_block, n_unit, "gn3/scale")]) + gn3_bias = np2th(weights[pjoin(n_block, n_unit, "gn3/bias")]) + + self.conv1.weight.copy_(conv1_weight) + self.conv2.weight.copy_(conv2_weight) + self.conv3.weight.copy_(conv3_weight) + + self.gn1.weight.copy_(gn1_weight.view(-1)) + self.gn1.bias.copy_(gn1_bias.view(-1)) + + self.gn2.weight.copy_(gn2_weight.view(-1)) + self.gn2.bias.copy_(gn2_bias.view(-1)) + + self.gn3.weight.copy_(gn3_weight.view(-1)) + self.gn3.bias.copy_(gn3_bias.view(-1)) + + if hasattr(self, 'downsample'): + proj_conv_weight = np2th(weights[pjoin(n_block, n_unit, "conv_proj/kernel")], conv=True) + proj_gn_weight = np2th(weights[pjoin(n_block, n_unit, "gn_proj/scale")]) + proj_gn_bias = np2th(weights[pjoin(n_block, n_unit, "gn_proj/bias")]) + + self.downsample.weight.copy_(proj_conv_weight) + self.gn_proj.weight.copy_(proj_gn_weight.view(-1)) + self.gn_proj.bias.copy_(proj_gn_bias.view(-1)) + +class ResNetV2(nn.Module): + 
"""Implementation of Pre-activation (v2) ResNet mode.""" + + def __init__(self, block_units, width_factor): + super().__init__() + width = int(64 * width_factor) + self.width = width + + self.root = nn.Sequential(OrderedDict([ + ('conv', StdConv2d(3, width, kernel_size=7, stride=2, bias=False, padding=3)), + ('gn', nn.GroupNorm(32, width, eps=1e-6)), + ('relu', nn.ReLU(inplace=True)), + # ('pool', nn.MaxPool2d(kernel_size=3, stride=2, padding=0)) + ])) + + self.body = nn.Sequential(OrderedDict([ + ('block1', nn.Sequential(OrderedDict( + [('unit1', PreActBottleneck(cin=width, cout=width*4, cmid=width))] + + [(f'unit{i:d}', PreActBottleneck(cin=width*4, cout=width*4, cmid=width)) for i in range(2, block_units[0] + 1)], + ))), + ('block2', nn.Sequential(OrderedDict( + [('unit1', PreActBottleneck(cin=width*4, cout=width*8, cmid=width*2, stride=2))] + + [(f'unit{i:d}', PreActBottleneck(cin=width*8, cout=width*8, cmid=width*2)) for i in range(2, block_units[1] + 1)], + ))), + ('block3', nn.Sequential(OrderedDict( + [('unit1', PreActBottleneck(cin=width*8, cout=width*16, cmid=width*4, stride=2))] + + [(f'unit{i:d}', PreActBottleneck(cin=width*16, cout=width*16, cmid=width*4)) for i in range(2, block_units[2] + 1)], + ))), + ])) + + def forward(self, x): + features = [] + b, c, in_size, _ = x.size() + x = self.root(x) + features.append(x) + x = nn.MaxPool2d(kernel_size=3, stride=2, padding=0)(x) + for i in range(len(self.body)-1): + x = self.body[i](x) + right_size = int(in_size / 4 / (i+1)) + if x.size()[2] != right_size: + pad = right_size - x.size()[2] + assert pad < 3 and pad > 0, "x {} should {}".format(x.size(), right_size) + feat = torch.zeros((b, x.size()[1], right_size, right_size), device=x.device) + feat[:, :, 0:x.size()[2], 0:x.size()[3]] = x[:] + else: + feat = x + features.append(feat) + x = self.body[-1](x) + return x, features[::-1] diff --git a/PuzzleTuning/SSL_structures/TransUNet_main/requirements.txt 
b/PuzzleTuning/SSL_structures/TransUNet_main/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4abfe422e0bd10ed594596292121fb6eac4d4581 --- /dev/null +++ b/PuzzleTuning/SSL_structures/TransUNet_main/requirements.txt @@ -0,0 +1,11 @@ +torch==1.4.0 +torchvision==0.5.0 +numpy +tqdm +tensorboard +tensorboardX +ml-collections +medpy +SimpleITK +scipy +h5py diff --git a/PuzzleTuning/SSL_structures/TransUNet_main/test.py b/PuzzleTuning/SSL_structures/TransUNet_main/test.py new file mode 100644 index 0000000000000000000000000000000000000000..35a48027e952822f29b7f439f85007ee81d9b92e --- /dev/null +++ b/PuzzleTuning/SSL_structures/TransUNet_main/test.py @@ -0,0 +1,140 @@ +import argparse +import logging +import os +import random +import sys +import numpy as np +import torch +import torch.backends.cudnn as cudnn +import torch.nn as nn +from torch.utils.data import DataLoader +from tqdm import tqdm +from datasets.dataset_synapse import Synapse_dataset +from utils import test_single_volume +from networks.vit_seg_modeling import VisionTransformer as ViT_seg +from networks.vit_seg_modeling import CONFIGS as CONFIGS_ViT_seg + +parser = argparse.ArgumentParser() +parser.add_argument('--volume_path', type=str, + default='../data/Synapse/test_vol_h5', help='root dir for validation volume data') # for acdc volume_path=root_dir +parser.add_argument('--dataset', type=str, + default='Synapse', help='experiment_name') +parser.add_argument('--num_classes', type=int, + default=4, help='output channel of network') +parser.add_argument('--list_dir', type=str, + default='./lists/lists_Synapse', help='list dir') + +parser.add_argument('--max_iterations', type=int,default=20000, help='maximum epoch number to train') +parser.add_argument('--max_epochs', type=int, default=30, help='maximum epoch number to train') +parser.add_argument('--batch_size', type=int, default=24, + help='batch_size per gpu') +parser.add_argument('--img_size', type=int, default=224, 
help='input patch size of network input') +parser.add_argument('--is_savenii', action="store_true", help='whether to save results during inference') + +parser.add_argument('--n_skip', type=int, default=3, help='using number of skip-connect, default is num') +parser.add_argument('--vit_name', type=str, default='ViT-B_16', help='select one vit model') + +parser.add_argument('--test_save_dir', type=str, default='../predictions', help='saving prediction as nii!') +parser.add_argument('--deterministic', type=int, default=1, help='whether use deterministic training') +parser.add_argument('--base_lr', type=float, default=0.01, help='segmentation network learning rate') +parser.add_argument('--seed', type=int, default=1234, help='random seed') +parser.add_argument('--vit_patches_size', type=int, default=16, help='vit_patches_size, default is 16') +args = parser.parse_args() + + +def inference(args, model, test_save_path=None): + db_test = args.Dataset(base_dir=args.volume_path, split="test_vol", list_dir=args.list_dir) + testloader = DataLoader(db_test, batch_size=1, shuffle=False, num_workers=1) + logging.info("{} test iterations per epoch".format(len(testloader))) + model.eval() + metric_list = 0.0 + for i_batch, sampled_batch in tqdm(enumerate(testloader)): + h, w = sampled_batch["image"].size()[2:] + image, label, case_name = sampled_batch["image"], sampled_batch["label"], sampled_batch['case_name'][0] + metric_i = test_single_volume(image, label, model, classes=args.num_classes, patch_size=[args.img_size, args.img_size], + test_save_path=test_save_path, case=case_name, z_spacing=args.z_spacing) + metric_list += np.array(metric_i) + logging.info('idx %d case %s mean_dice %f mean_hd95 %f' % (i_batch, case_name, np.mean(metric_i, axis=0)[0], np.mean(metric_i, axis=0)[1])) + metric_list = metric_list / len(db_test) + for i in range(1, args.num_classes): + logging.info('Mean class %d mean_dice %f mean_hd95 %f' % (i, metric_list[i-1][0], metric_list[i-1][1])) + performance 
= np.mean(metric_list, axis=0)[0] + mean_hd95 = np.mean(metric_list, axis=0)[1] + logging.info('Testing performance in best val model: mean_dice : %f mean_hd95 : %f' % (performance, mean_hd95)) + return "Testing Finished!" + + +if __name__ == "__main__": + + if not args.deterministic: + cudnn.benchmark = True + cudnn.deterministic = False + else: + cudnn.benchmark = False + cudnn.deterministic = True + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.cuda.manual_seed(args.seed) + + dataset_config = { + 'Synapse': { + 'Dataset': Synapse_dataset, + 'volume_path': '../data/Synapse/test_vol_h5', + 'list_dir': './lists/lists_Synapse', + 'num_classes': 9, + 'z_spacing': 1, + }, + } + dataset_name = args.dataset + args.num_classes = dataset_config[dataset_name]['num_classes'] + args.volume_path = dataset_config[dataset_name]['volume_path'] + args.Dataset = dataset_config[dataset_name]['Dataset'] + args.list_dir = dataset_config[dataset_name]['list_dir'] + args.z_spacing = dataset_config[dataset_name]['z_spacing'] + args.is_pretrain = True + + # name the same snapshot defined in train script! 
+ args.exp = 'TU_' + dataset_name + str(args.img_size) + snapshot_path = "../model/{}/{}".format(args.exp, 'TU') + snapshot_path = snapshot_path + '_pretrain' if args.is_pretrain else snapshot_path + snapshot_path += '_' + args.vit_name + snapshot_path = snapshot_path + '_skip' + str(args.n_skip) + snapshot_path = snapshot_path + '_vitpatch' + str(args.vit_patches_size) if args.vit_patches_size!=16 else snapshot_path + snapshot_path = snapshot_path + '_epo' + str(args.max_epochs) if args.max_epochs != 30 else snapshot_path + if dataset_name == 'ACDC': # using max_epoch instead of iteration to control training duration + snapshot_path = snapshot_path + '_' + str(args.max_iterations)[0:2] + 'k' if args.max_iterations != 30000 else snapshot_path + snapshot_path = snapshot_path+'_bs'+str(args.batch_size) + snapshot_path = snapshot_path + '_lr' + str(args.base_lr) if args.base_lr != 0.01 else snapshot_path + snapshot_path = snapshot_path + '_'+str(args.img_size) + snapshot_path = snapshot_path + '_s'+str(args.seed) if args.seed!=1234 else snapshot_path + + config_vit = CONFIGS_ViT_seg[args.vit_name] + config_vit.n_classes = args.num_classes + config_vit.n_skip = args.n_skip + config_vit.patches.size = (args.vit_patches_size, args.vit_patches_size) + if args.vit_name.find('R50') !=-1: + config_vit.patches.grid = (int(args.img_size/args.vit_patches_size), int(args.img_size/args.vit_patches_size)) + net = ViT_seg(config_vit, img_size=args.img_size, num_classes=config_vit.n_classes).cuda() + + snapshot = os.path.join(snapshot_path, 'best_model.pth') + if not os.path.exists(snapshot): snapshot = snapshot.replace('best_model', 'epoch_'+str(args.max_epochs-1)) + net.load_state_dict(torch.load(snapshot)) + snapshot_name = snapshot_path.split('/')[-1] + + log_folder = './test_log/test_log_' + args.exp + os.makedirs(log_folder, exist_ok=True) + logging.basicConfig(filename=log_folder + '/'+snapshot_name+".txt", level=logging.INFO, format='[%(asctime)s.%(msecs)03d] %(message)s', 
datefmt='%H:%M:%S') + logging.getLogger().addHandler(logging.StreamHandler(sys.stdout)) + logging.info(str(args)) + logging.info(snapshot_name) + + if args.is_savenii: + args.test_save_dir = '../predictions' + test_save_path = os.path.join(args.test_save_dir, args.exp, snapshot_name) + os.makedirs(test_save_path, exist_ok=True) + else: + test_save_path = None + inference(args, net, test_save_path) + + diff --git a/PuzzleTuning/SSL_structures/TransUNet_main/train.py b/PuzzleTuning/SSL_structures/TransUNet_main/train.py new file mode 100644 index 0000000000000000000000000000000000000000..438dc76b9d2a00f2abe4aacca9d7279dcad4685d --- /dev/null +++ b/PuzzleTuning/SSL_structures/TransUNet_main/train.py @@ -0,0 +1,93 @@ +import argparse +import logging +import os +import random +import numpy as np +import torch +import torch.backends.cudnn as cudnn +from networks.vit_seg_modeling import VisionTransformer as ViT_seg +from networks.vit_seg_modeling import CONFIGS as CONFIGS_ViT_seg +from trainer import trainer_synapse + +parser = argparse.ArgumentParser() +parser.add_argument('--root_path', type=str, + default='../data/Synapse/train_npz', help='root dir for data') +parser.add_argument('--dataset', type=str, + default='Synapse', help='experiment_name') +parser.add_argument('--list_dir', type=str, + default='./lists/lists_Synapse', help='list dir') +parser.add_argument('--num_classes', type=int, + default=9, help='output channel of network') +parser.add_argument('--max_iterations', type=int, + default=30000, help='maximum epoch number to train') +parser.add_argument('--max_epochs', type=int, + default=150, help='maximum epoch number to train') +parser.add_argument('--batch_size', type=int, + default=24, help='batch_size per gpu') +parser.add_argument('--n_gpu', type=int, default=1, help='total gpu') +parser.add_argument('--deterministic', type=int, default=1, + help='whether use deterministic training') +parser.add_argument('--base_lr', type=float, default=0.01, + 
help='segmentation network learning rate') +parser.add_argument('--img_size', type=int, + default=224, help='input patch size of network input') +parser.add_argument('--seed', type=int, + default=1234, help='random seed') +parser.add_argument('--n_skip', type=int, + default=3, help='using number of skip-connect, default is num') +parser.add_argument('--vit_name', type=str, + default='R50-ViT-B_16', help='select one vit model') +parser.add_argument('--vit_patches_size', type=int, + default=16, help='vit_patches_size, default is 16') +args = parser.parse_args() + + +if __name__ == "__main__": + if not args.deterministic: + cudnn.benchmark = True + cudnn.deterministic = False + else: + cudnn.benchmark = False + cudnn.deterministic = True + + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.cuda.manual_seed(args.seed) + dataset_name = args.dataset + dataset_config = { + 'Synapse': { + 'root_path': '../data/Synapse/train_npz', + 'list_dir': './lists/lists_Synapse', + 'num_classes': 9, + }, + } + args.num_classes = dataset_config[dataset_name]['num_classes'] + args.root_path = dataset_config[dataset_name]['root_path'] + args.list_dir = dataset_config[dataset_name]['list_dir'] + args.is_pretrain = True + args.exp = 'TU_' + dataset_name + str(args.img_size) + snapshot_path = "../model/{}/{}".format(args.exp, 'TU') + snapshot_path = snapshot_path + '_pretrain' if args.is_pretrain else snapshot_path + snapshot_path += '_' + args.vit_name + snapshot_path = snapshot_path + '_skip' + str(args.n_skip) + snapshot_path = snapshot_path + '_vitpatch' + str(args.vit_patches_size) if args.vit_patches_size!=16 else snapshot_path + snapshot_path = snapshot_path+'_'+str(args.max_iterations)[0:2]+'k' if args.max_iterations != 30000 else snapshot_path + snapshot_path = snapshot_path + '_epo' +str(args.max_epochs) if args.max_epochs != 30 else snapshot_path + snapshot_path = snapshot_path+'_bs'+str(args.batch_size) + snapshot_path = snapshot_path + 
'_lr' + str(args.base_lr) if args.base_lr != 0.01 else snapshot_path + snapshot_path = snapshot_path + '_'+str(args.img_size) + snapshot_path = snapshot_path + '_s'+str(args.seed) if args.seed!=1234 else snapshot_path + + if not os.path.exists(snapshot_path): + os.makedirs(snapshot_path) + config_vit = CONFIGS_ViT_seg[args.vit_name] + config_vit.n_classes = args.num_classes + config_vit.n_skip = args.n_skip + if args.vit_name.find('R50') != -1: + config_vit.patches.grid = (int(args.img_size / args.vit_patches_size), int(args.img_size / args.vit_patches_size)) + net = ViT_seg(config_vit, img_size=args.img_size, num_classes=config_vit.n_classes).cuda() + net.load_from(weights=np.load(config_vit.pretrained_path)) + + trainer = {'Synapse': trainer_synapse,} + trainer[dataset_name](args, net, snapshot_path) \ No newline at end of file diff --git a/PuzzleTuning/SSL_structures/TransUNet_main/trainer.py b/PuzzleTuning/SSL_structures/TransUNet_main/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..2445e10ce2ef85789041532ec47d6f9674016070 --- /dev/null +++ b/PuzzleTuning/SSL_structures/TransUNet_main/trainer.py @@ -0,0 +1,96 @@ +import argparse +import logging +import os +import random +import sys +import time +import numpy as np +import torch +import torch.nn as nn +import torch.optim as optim +from tensorboardX import SummaryWriter +from torch.nn.modules.loss import CrossEntropyLoss +from torch.utils.data import DataLoader +from tqdm import tqdm +from utils import DiceLoss +from torchvision import transforms + +def trainer_synapse(args, model, snapshot_path): + from datasets.dataset_synapse import Synapse_dataset, RandomGenerator + logging.basicConfig(filename=snapshot_path + "/log.txt", level=logging.INFO, + format='[%(asctime)s.%(msecs)03d] %(message)s', datefmt='%H:%M:%S') + logging.getLogger().addHandler(logging.StreamHandler(sys.stdout)) + logging.info(str(args)) + base_lr = args.base_lr + num_classes = args.num_classes + batch_size = 
args.batch_size * args.n_gpu + # max_iterations = args.max_iterations + db_train = Synapse_dataset(base_dir=args.root_path, list_dir=args.list_dir, split="train", + transform=transforms.Compose( + [RandomGenerator(output_size=[args.img_size, args.img_size])])) + print("The length of train set is: {}".format(len(db_train))) + + def worker_init_fn(worker_id): + random.seed(args.seed + worker_id) + + trainloader = DataLoader(db_train, batch_size=batch_size, shuffle=True, num_workers=8, pin_memory=True, + worker_init_fn=worker_init_fn) + if args.n_gpu > 1: + model = nn.DataParallel(model) + model.train() + ce_loss = CrossEntropyLoss() + dice_loss = DiceLoss(num_classes) + optimizer = optim.SGD(model.parameters(), lr=base_lr, momentum=0.9, weight_decay=0.0001) + writer = SummaryWriter(snapshot_path + '/log') + iter_num = 0 + max_epoch = args.max_epochs + max_iterations = args.max_epochs * len(trainloader) # max_epoch = max_iterations // len(trainloader) + 1 + logging.info("{} iterations per epoch. 
{} max iterations ".format(len(trainloader), max_iterations)) + best_performance = 0.0 + iterator = tqdm(range(max_epoch), ncols=70) + for epoch_num in iterator: + for i_batch, sampled_batch in enumerate(trainloader): + image_batch, label_batch = sampled_batch['image'], sampled_batch['label'] + image_batch, label_batch = image_batch.cuda(), label_batch.cuda() + outputs = model(image_batch) + loss_ce = ce_loss(outputs, label_batch[:].long()) + loss_dice = dice_loss(outputs, label_batch, softmax=True) + loss = 0.5 * loss_ce + 0.5 * loss_dice + optimizer.zero_grad() + loss.backward() + optimizer.step() + lr_ = base_lr * (1.0 - iter_num / max_iterations) ** 0.9 + for param_group in optimizer.param_groups: + param_group['lr'] = lr_ + + iter_num = iter_num + 1 + writer.add_scalar('info/lr', lr_, iter_num) + writer.add_scalar('info/total_loss', loss, iter_num) + writer.add_scalar('info/loss_ce', loss_ce, iter_num) + + logging.info('iteration %d : loss : %f, loss_ce: %f' % (iter_num, loss.item(), loss_ce.item())) + + if iter_num % 20 == 0: + image = image_batch[1, 0:1, :, :] + image = (image - image.min()) / (image.max() - image.min()) + writer.add_image('train/Image', image, iter_num) + outputs = torch.argmax(torch.softmax(outputs, dim=1), dim=1, keepdim=True) + writer.add_image('train/Prediction', outputs[1, ...] 
* 50, iter_num) + labs = label_batch[1, ...].unsqueeze(0) * 50 + writer.add_image('train/GroundTruth', labs, iter_num) + + save_interval = 50 # int(max_epoch/6) + if epoch_num > int(max_epoch / 2) and (epoch_num + 1) % save_interval == 0: + save_mode_path = os.path.join(snapshot_path, 'epoch_' + str(epoch_num) + '.pth') + torch.save(model.state_dict(), save_mode_path) + logging.info("save model to {}".format(save_mode_path)) + + if epoch_num >= max_epoch - 1: + save_mode_path = os.path.join(snapshot_path, 'epoch_' + str(epoch_num) + '.pth') + torch.save(model.state_dict(), save_mode_path) + logging.info("save model to {}".format(save_mode_path)) + iterator.close() + break + + writer.close() + return "Training Finished!" \ No newline at end of file diff --git a/PuzzleTuning/SSL_structures/TransUNet_main/utils.py b/PuzzleTuning/SSL_structures/TransUNet_main/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0e3a1bf9ad7058f506faff0a8ae4f6056514575c --- /dev/null +++ b/PuzzleTuning/SSL_structures/TransUNet_main/utils.py @@ -0,0 +1,102 @@ +import numpy as np +import torch +from medpy import metric +from scipy.ndimage import zoom +import torch.nn as nn +import SimpleITK as sitk + + +class DiceLoss(nn.Module): + def __init__(self, n_classes): + super(DiceLoss, self).__init__() + self.n_classes = n_classes + + def _one_hot_encoder(self, input_tensor): + tensor_list = [] + for i in range(self.n_classes): + temp_prob = input_tensor == i # * torch.ones_like(input_tensor) + tensor_list.append(temp_prob.unsqueeze(1)) + output_tensor = torch.cat(tensor_list, dim=1) + return output_tensor.float() + + def _dice_loss(self, score, target): + target = target.float() + smooth = 1e-5 + intersect = torch.sum(score * target) + y_sum = torch.sum(target * target) + z_sum = torch.sum(score * score) + loss = (2 * intersect + smooth) / (z_sum + y_sum + smooth) + loss = 1 - loss + return loss + + def forward(self, inputs, target, weight=None, softmax=False): + if 
softmax: + inputs = torch.softmax(inputs, dim=1) + target = self._one_hot_encoder(target) + if weight is None: + weight = [1] * self.n_classes + assert inputs.size() == target.size(), 'predict {} & target {} shape do not match'.format(inputs.size(), target.size()) + class_wise_dice = [] + loss = 0.0 + for i in range(0, self.n_classes): + dice = self._dice_loss(inputs[:, i], target[:, i]) + class_wise_dice.append(1.0 - dice.item()) + loss += dice * weight[i] + return loss / self.n_classes + + +def calculate_metric_percase(pred, gt): + pred[pred > 0] = 1 + gt[gt > 0] = 1 + if pred.sum() > 0 and gt.sum()>0: + dice = metric.binary.dc(pred, gt) + hd95 = metric.binary.hd95(pred, gt) + return dice, hd95 + elif pred.sum() > 0 and gt.sum()==0: + return 1, 0 + else: + return 0, 0 + + +def test_single_volume(image, label, net, classes, patch_size=[256, 256], test_save_path=None, case=None, z_spacing=1): + image, label = image.squeeze(0).cpu().detach().numpy(), label.squeeze(0).cpu().detach().numpy() + if len(image.shape) == 3: + prediction = np.zeros_like(label) + for ind in range(image.shape[0]): + slice = image[ind, :, :] + x, y = slice.shape[0], slice.shape[1] + if x != patch_size[0] or y != patch_size[1]: + slice = zoom(slice, (patch_size[0] / x, patch_size[1] / y), order=3) # previous using 0 + input = torch.from_numpy(slice).unsqueeze(0).unsqueeze(0).float().cuda() + net.eval() + with torch.no_grad(): + outputs = net(input) + out = torch.argmax(torch.softmax(outputs, dim=1), dim=1).squeeze(0) + out = out.cpu().detach().numpy() + if x != patch_size[0] or y != patch_size[1]: + pred = zoom(out, (x / patch_size[0], y / patch_size[1]), order=0) + else: + pred = out + prediction[ind] = pred + else: + input = torch.from_numpy(image).unsqueeze( + 0).unsqueeze(0).float().cuda() + net.eval() + with torch.no_grad(): + out = torch.argmax(torch.softmax(net(input), dim=1), dim=1).squeeze(0) + prediction = out.cpu().detach().numpy() + metric_list = [] + for i in range(1, classes): + 
metric_list.append(calculate_metric_percase(prediction == i, label == i)) + + if test_save_path is not None: + img_itk = sitk.GetImageFromArray(image.astype(np.float32)) + prd_itk = sitk.GetImageFromArray(prediction.astype(np.float32)) + lab_itk = sitk.GetImageFromArray(label.astype(np.float32)) + img_itk.SetSpacing((1, 1, z_spacing)) + prd_itk.SetSpacing((1, 1, z_spacing)) + lab_itk.SetSpacing((1, 1, z_spacing)) + sitk.WriteImage(prd_itk, test_save_path + '/'+case + "_pred.nii.gz") + sitk.WriteImage(img_itk, test_save_path + '/'+ case + "_img.nii.gz") + sitk.WriteImage(lab_itk, test_save_path + '/'+ case + "_gt.nii.gz") + return metric_list \ No newline at end of file diff --git a/PuzzleTuning/SSL_structures/UtnetV2/conv_layers.py b/PuzzleTuning/SSL_structures/UtnetV2/conv_layers.py new file mode 100644 index 0000000000000000000000000000000000000000..0f39d6c816e3a2c9b98334a6b8377e6c3251a997 --- /dev/null +++ b/PuzzleTuning/SSL_structures/UtnetV2/conv_layers.py @@ -0,0 +1,358 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from timm.models.layers import trunc_normal_, DropPath +import pdb + +__all__ = [ + 'ConvNormAct', + 'SingleConv', + 'BasicBlock', + 'Bottleneck', + 'DepthwiseSeparableConv', + 'SEBlock', + 'DropPath', + 'MBConv', + 'FusedMBConv', + 'ConvNeXtBlock', + 'LayerNorm' +] + +class ConvNormAct(nn.Module): + """ + Layer grouping a convolution, normalization and activation funtion + normalization includes BN and IN + """ + def __init__(self, in_ch, out_ch, kernel_size=3, stride=1, padding=0, + groups=1, dilation=1, bias=False, norm=nn.BatchNorm2d, act=nn.ReLU, preact=False): + + super().__init__() + assert norm in [nn.BatchNorm2d, nn.InstanceNorm2d, True, False] + assert act in [nn.ReLU, nn.ReLU6, nn.GELU, nn.SiLU, True, False] + + self.conv = nn.Conv2d( + in_channels=in_ch, + out_channels=out_ch, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + dilation=dilation, + bias=bias + ) + if preact: + 
self.norm = norm(in_ch) if norm else nn.Identity() + else: + self.norm = norm(out_ch) if norm else nn.Identity() + self.act = act() if act else nn.Identity() + self.preact = preact + + def forward(self, x): + + if self.preact: + out = self.conv(self.act(self.norm(x))) # norm relu conv + else: + out = self.act(self.norm(self.conv(x))) # conv norm relu + + return out + +class SingleConv(nn.Module): + def __init__(self, in_ch, out_ch, stride=1, norm=nn.BatchNorm2d, act=nn.ReLU, preact=False): + super().__init__() + assert norm in [nn.BatchNorm2d, nn.InstanceNorm2d, LayerNorm, True, False] + assert act in [nn.ReLU, nn.ReLU6, nn.GELU, nn.SiLU, True, False] + + + self.conv = ConvNormAct(in_ch, out_ch, 3, stride=stride, padding=1, norm=norm, act=act, preact=preact) + + def forward(self, x): + + return self.conv(x) + + + +class BasicBlock(nn.Module): + def __init__(self, in_ch, out_ch, stride=1, norm=nn.BatchNorm2d, act=nn.ReLU, preact=True): + super().__init__() + assert norm in [nn.BatchNorm2d, nn.InstanceNorm2d, True, False] + assert act in [nn.ReLU, nn.ReLU6, nn.GELU, nn.SiLU, True, False] + + self.conv1 = ConvNormAct(in_ch, out_ch, 3, stride=stride, padding=1, norm=norm, act=act, preact=preact) + self.conv2 = ConvNormAct(out_ch, out_ch, 3, stride=1, padding=1, norm=norm, act=act, preact=preact) + + self.shortcut = nn.Sequential() + if stride != 1 or in_ch != out_ch: # 如果不相等就通过一层conv将残差改变 + self.shortcut = ConvNormAct(in_ch, out_ch, 3, stride=stride, padding=1, norm=norm, act=act, preact=preact) + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.conv2(out) + + out += self.shortcut(residual) + + return out + +class Bottleneck(nn.Module): + def __init__(self, in_ch, out_ch, stride=1, groups=1, dilation=1, norm=nn.BatchNorm2d, act=nn.ReLU, preact=True): + super().__init__() + assert norm in [nn.BatchNorm2d, nn.InstanceNorm2d, True, False] + assert act in [nn.ReLU, nn.ReLU6, nn.GELU, nn.SiLU, True, False] + self.expansion = 4 + self.conv1 = 
ConvNormAct(in_ch, out_ch//self.expansion, 1, stride=1, padding=0, norm=norm, act=act, preact=preact) + self.conv2 = ConvNormAct(out_ch//self.expansion, out_ch//self.expansion, 3, stride=stride, padding=1, norm=norm, act=act, groups=groups, dilation=dilation, preact=preact) + + self.conv3 = ConvNormAct(out_ch//self.expansion, out_ch, 1, stride=1, padding=0, norm=norm, act=act, preact=preact) + self.shortcut = nn.Sequential() + if stride != 1 or in_ch != out_ch: + self.shortcut = ConvNormAct(in_ch, out_ch, 3, stride=stride, padding=1, norm=norm, act=act, preact=preact) + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.conv2(out) + out = self.conv3(out) + + out += self.shortcut(residual) + + return out + + + + +class DepthwiseSeparableConv(nn.Module): + def __init__(self, in_ch, out_ch, stride=1, kernel_size=3, padding=1, bias=False): + super().__init__() + self.depthwise = nn.Conv2d( + in_channels=in_ch, + out_channels=in_ch, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=in_ch, + bias=bias + ) + self.pointwise = nn.Conv2d( + in_channels=in_ch, + out_channels=out_ch, + kernel_size=1, + stride=1, + padding=0, + groups=1, + bias=bias + ) + def forward(self, x): + out = self.depthwise(x) + out = self.pointwise(out) + + return out + + +class SEBlock(nn.Module): + def __init__(self, in_ch, ratio=4, act=nn.ReLU): + super().__init__() + + self.squeeze = nn.AdaptiveAvgPool2d(1) + self.excitation = nn.Sequential( + nn.Conv2d(in_ch, in_ch//ratio, kernel_size=1), + act(), + nn.Conv2d(in_ch//ratio, in_ch, kernel_size=1), + nn.Sigmoid() + ) + def forward(self, x): + out = self.squeeze(x) + out = self.excitation(out) + + return x * out + +class DropPath(nn.Module): + """ + Drop connection with pobability p + """ + def __init__(self, p=0): + super().__init__() + + self.p = p + def forward(self, x): + if (not self.p) or (not self.training): + return x + + batch_size = x.shape[0] + random_tensor = torch.rand(batch_size, 1, 1, 
1).to(x.device) + binary_mask = self.p < random_tensor + + x = x.div(1 - self.p) + x = x * binary_mask + + return x + +class MBConv(nn.Module): + """ + MBConv with an expansion factor of N, and squeeze-and-excitation module + """ + def __init__(self, in_ch, out_ch, expansion=4, kernel_size=3, stride=1, ratio=4, p=0, se=True, norm=nn.BatchNorm2d, act=nn.ReLU): + super().__init__() + + + padding = (kernel_size - 1) // 2 + expanded = expansion * in_ch + self.se = se + + self.expand_proj = nn.Identity() if (expansion==1) else ConvNormAct(in_ch, expanded, kernel_size=1, norm=norm, act=act, preact=True) + + self.depthwise = ConvNormAct(expanded, expanded, kernel_size=kernel_size, stride=stride, padding=padding, groups=expanded, act=act, norm=norm, preact=True) + + if self.se: + self.se_block = SEBlock(expanded, ratio=ratio) + + self.pointwise = ConvNormAct(expanded, out_ch, kernel_size=1, padding=0, norm=norm, act=False, preact=True) + + self.drop_path = DropPath(p) + + self.shortcut = nn.Sequential() + if in_ch != out_ch or stride != 1: + self.shortcut = nn.Sequential(ConvNormAct(in_ch, out_ch, kernel_size, stride=stride, padding=padding, norm=False, act=False)) + + def forward(self, x): + residual = x + + x = self.expand_proj(x) + x = self.depthwise(x) + if self.se: + x = self.se_block(x) + x = self.pointwise(x) + + x = self.drop_path(x) + + x = x + self.shortcut(residual) + + return x + +class FusedMBConv(nn.Module): + """ + MBConv with an expansion factor of N, and squeeze-and-excitation module + """ + def __init__(self, in_ch, out_ch, expansion=4, kernel_size=3, stride=1, ratio=4, p=0, se=True, norm=nn.BatchNorm2d, act=nn.ReLU): + super().__init__() + + + padding = (kernel_size - 1) // 2 + expanded = expansion * in_ch + + self.stride = stride + self.se = se + + self.conv3x3 = ConvNormAct(in_ch, expanded, kernel_size=kernel_size, stride=stride, padding=padding, groups=1, norm=norm, act=act, preact=True) + + if self.se: + self.se_block = SEBlock(expanded, ratio=ratio) 
+ + self.pointwise = ConvNormAct(expanded, out_ch, kernel_size=1, padding=0, norm=norm, act=False, preact=True) + + self.drop_path = DropPath(p) + + self.shortcut = nn.Sequential() + if in_ch != out_ch or stride != 1: + self.shortcut = nn.Sequential(ConvNormAct(in_ch, out_ch, 3, stride=stride, padding=1, norm=False, act=False)) + + def forward(self, x): + residual = x + + x = self.conv3x3(x) + if self.se: + x = self.se_block(x) + x = self.pointwise(x) + + x = self.drop_path(x) + + x = x + self.shortcut(residual) + + return x + +class ConvNeXtBlock(nn.Module): + r""" ConvNeXt Block. There are two equivalent implementations: + (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W) + (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back + We use (2) as we find it slightly faster in PyTorch + Args: + dim (int): Number of input channels. + drop_path (float): Stochastic depth rate. Default: 0.0 + layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. + """ + + def __init__(self, dim, out_ch, stride=1, kernel_size=7, norm=None, act=None, preact=None, drop_path=0., layer_scale_init_value=1e-6): + + super().__init__() + padding = kernel_size // 2 + self.dwconv = nn.Conv2d(dim, dim, kernel_size=kernel_size, padding=padding, groups=dim) # depthwise conv + self.norm = LayerNorm(dim, eps=1e-6) + self.pwconv1 = nn.Linear(dim, 4 * dim) # pointwise/1x1 convs, implemented with linear layers + self.act = nn.GELU() + self.pwconv2 = nn.Linear(4 * dim, dim) + self.gamma = nn.Parameter(layer_scale_init_value * torch.ones((dim)), + requires_grad=True) if layer_scale_init_value > 0 else None + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + + def forward(self, x): + input = x + x = self.dwconv(x) + x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C) + x = self.norm(x) + x = self.pwconv1(x) + x = self.act(x) + x = self.pwconv2(x) + if self.gamma is not None: + x = self.gamma * x + x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W) + + x = input + self.drop_path(x) + + return x + +class LayerNorm(nn.Module): + + r""" LayerNorm that supports two data formats: channels_last (default) or channels_first. + + The ordering of the dimensions in the inputs. channels_last corresponds to inputs with + + shape (batch_size, height, width, channels) while channels_first corresponds to inputs + + with shape (batch_size, channels, height, width). + + """ + + def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"): + super().__init__() + self.weight = nn.Parameter(torch.ones(normalized_shape)) + self.bias = nn.Parameter(torch.zeros(normalized_shape)) + self.eps = eps + self.data_format = data_format + + if self.data_format not in ["channels_last", "channels_first"]: + raise NotImplementedError + self.normalized_shape = (normalized_shape, ) + + def forward(self, x): + if self.data_format == "channels_last": + return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) + elif self.data_format == "channels_first": + u = x.mean(1, keepdim=True) + s = (x - u).pow(2).mean(1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.eps) + x = self.weight[:, None, None] * x + self.bias[:, None, None] + + return x + + +if __name__ == '__main__': + img = torch.randn(2, 3, 256, 256) + depth_conv = DepthwiseSeparableConv(3, 32) + + out = depth_conv(img) + print(out.shape) + + + + diff --git a/PuzzleTuning/SSL_structures/UtnetV2/trans_layers.py b/PuzzleTuning/SSL_structures/UtnetV2/trans_layers.py new file mode 100644 index 0000000000000000000000000000000000000000..66a7b58a0da49c879ea042657b093716441a76ed --- /dev/null +++ 
b/PuzzleTuning/SSL_structures/UtnetV2/trans_layers.py @@ -0,0 +1,94 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange +import pdb + + +__all__ = [ + 'Mlp', + 'Attention', + 'TransformerBlock', +] + +class Mlp(nn.Module): + def __init__(self, in_dim, hid_dim=None, out_dim=None, act=nn.GELU, drop=0.): + super().__init__() + out_dim = out_dim or in_dim + hid_dim = hid_dim or in_dim + self.fc1 = nn.Linear(in_dim, hid_dim) + self.act = act() + self.fc2 = nn.Linear(hid_dim, out_dim) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + + return x + +class PreNorm(nn.Module): + def __init__(self, dim, fn): + super().__init__() + self.norm = nn.LayerNorm(dim) + self.fn = fn + def forward(self, x, **kwargs): + return self.fn(self.norm(x), **kwargs) + + + +class Attention(nn.Module): + def __init__(self, dim, heads, dim_head, attn_drop=0., proj_drop=0.): + super().__init__() + + inner_dim = dim_head * heads + + self.heads = heads + self.scale = dim_head ** -0.5 + + self.to_qkv = nn.Linear(dim, inner_dim*3, bias=False) + + self.to_out = nn.Linear(inner_dim, dim) + + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + # x: B, L, C. 
Batch, sequence length, dim + q, k, v = self.to_qkv(x).chunk(3, dim=-1) + + q, k, v = map(lambda t: rearrange(t, 'b l (heads dim_head) -> b heads l dim_head', heads=self.heads), [q, k, v]) + attn = torch.einsum('bhid,bhjd->bhij', q, k) * self.scale + + attn = F.softmax(attn, dim=-1) + + attned = torch.einsum('bhij,bhjd->bhid', attn, v) + attned = rearrange(attned, 'b heads l dim_head -> b l (dim_head heads)') + + attned = self.to_out(attned) + + return attned + + +class TransformerBlock(nn.Module): + def __init__(self, dim, depth, heads, dim_head, mlp_dim, attn_drop=0., proj_drop=0.): + super().__init__() + + self.layers = nn.ModuleList([]) + + for i in range(depth): + self.layers.append(nn.ModuleList([ + PreNorm(dim, Attention(dim, heads, dim_head, attn_drop, proj_drop)), + PreNorm(dim, Mlp(dim, mlp_dim, dim, drop=proj_drop)) + ])) + def forward(self, x): + + for attn, ffn in self.layers: + x = attn(x) + x + x = ffn(x) + x + + return x + + diff --git a/PuzzleTuning/SSL_structures/UtnetV2/utnetv2.py b/PuzzleTuning/SSL_structures/UtnetV2/utnetv2.py new file mode 100644 index 0000000000000000000000000000000000000000..8c5d8151220dc005f56bc9defbf4b1a451e16f8c --- /dev/null +++ b/PuzzleTuning/SSL_structures/UtnetV2/utnetv2.py @@ -0,0 +1,77 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +# from .utils import get_block +from .utnetv2_utils import Down_block, Up_block, inconv, SemanticMapFusion +import pdb + +from .conv_layers import BasicBlock, Bottleneck, SingleConv, MBConv, FusedMBConv, ConvNeXtBlock + +def get_block(name): + block_map = { + 'SingleConv': SingleConv, + 'BasicBlock': BasicBlock, + 'Bottleneck': Bottleneck, + 'MBConv': MBConv, + 'FusedMBConv': FusedMBConv, + 'ConvNeXtBlock': ConvNeXtBlock + } + return block_map[name] + + +class UTNetV2(nn.Module): + + def __init__(self, in_chan, num_classes, base_chan=32, map_size=8, conv_block='BasicBlock', conv_num=[2,1,0,0, 0,1,2,2], trans_num=[0,1,2,2, 2,1,0,0], num_heads=[1,4,8,16, 
8,4,1,1], fusion_depth=2, fusion_dim=512, fusion_heads=16, expansion=4, attn_drop=0., proj_drop=0., proj_type='depthwise', norm=nn.BatchNorm2d, act=nn.GELU): + super().__init__() + + + chan_num = [2*base_chan, 4*base_chan, 8*base_chan, 16*base_chan, + 8*base_chan, 4*base_chan, 2*base_chan, base_chan] # [64, 128, 256, 512, 256, 128, 64, 32] + dim_head = [chan_num[i]//num_heads[i] for i in range(8)] # [64, 32, 32, 32, 32, 32, 64, 32] + conv_block = get_block(conv_block) # BasicBlock + + # self.inc and self.down1 forms the conv stem + self.inc = inconv(in_chan, base_chan, norm=norm, act=act) + self.down1 = Down_block(base_chan, chan_num[0], conv_num[0], trans_num[0], conv_block, norm=norm, act=act, map_generate=False, map_proj=False) + # self.down1 = down_block(32, 64, 2, 0, basicblock, batchnorm, gelu, False, False) + + # down2 down3 down4 apply the B-MHA blocks + self.down2 = Down_block(chan_num[0], chan_num[1], conv_num[1], trans_num[1], conv_block, heads=num_heads[1], dim_head=dim_head[1], expansion=expansion, attn_drop=attn_drop, proj_drop=proj_drop, map_size=map_size, proj_type=proj_type, norm=norm, act=act, map_generate=True, map_proj=False) + self.down3 = Down_block(chan_num[1], chan_num[2], conv_num[2], trans_num[2], conv_block, heads=num_heads[2], dim_head=dim_head[2], expansion=expansion, attn_drop=attn_drop, proj_drop=proj_drop, map_size=map_size, proj_type=proj_type, norm=norm, act=act, map_generate=False, map_proj=True) + self.down4 = Down_block(chan_num[2], chan_num[3], conv_num[3], trans_num[3], conv_block, heads=num_heads[3], dim_head=dim_head[3], expansion=expansion, attn_drop=attn_drop, proj_drop=proj_drop, map_size=map_size, proj_type=proj_type, norm=norm, act=act, map_generate=False, map_proj=True) + + + self.map_fusion = SemanticMapFusion(chan_num[1:4], fusion_dim, fusion_heads, depth=fusion_depth, norm=norm) + + + self.up1 = Up_block(chan_num[3], chan_num[4], conv_num[4], trans_num[4], conv_block, heads=num_heads[4], dim_head=dim_head[4], 
expansion=expansion, attn_drop=attn_drop, proj_drop=proj_drop, map_size=map_size, proj_type=proj_type, norm=norm, act=act, map_shortcut=True) + self.up2 = Up_block(chan_num[4], chan_num[5], conv_num[5], trans_num[5], conv_block, heads=num_heads[5], dim_head=dim_head[5], expansion=expansion, attn_drop=attn_drop, proj_drop=proj_drop, map_size=map_size, proj_type=proj_type, norm=norm, act=act, map_shortcut=True) + + # up3 up4 form the conv decoder + self.up3 = Up_block(chan_num[5], chan_num[6], conv_num[6], trans_num[6], conv_block, norm=norm, act=act, map_shortcut=False) + self.up4 = Up_block(chan_num[6], chan_num[7], conv_num[7], trans_num[7], conv_block, norm=norm, act=act, map_shortcut=False) + + + self.outc = nn.Conv2d(chan_num[7], num_classes, kernel_size=1) + + def forward(self, x): + # print('x: ', x.shape) + x0 = self.inc(x) # (3, 480, 480) -> (32, 480, 480) + x1, _ = self.down1(x0) + x2, map2 = self.down2(x1, None) + x3, map3 = self.down3(x2, map2) + x4, map4 = self.down4(x3, map3) + + map_list = [map2, map3, map4] + map_list = self.map_fusion(map_list) + + out, semantic_map = self.up1(x4, x3, map_list[2], map_list[1]) + out, semantic_map = self.up2(out, x2, semantic_map, map_list[0]) + out, semantic_map = self.up3(out, x1, semantic_map, None) + out, semantic_map = self.up4(out, x0, semantic_map, None) + + out = self.outc(out) + + return out + diff --git a/PuzzleTuning/SSL_structures/UtnetV2/utnetv2_utils.py b/PuzzleTuning/SSL_structures/UtnetV2/utnetv2_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9f35a7d85ce7544a76b9472048165a6b40b20664 --- /dev/null +++ b/PuzzleTuning/SSL_structures/UtnetV2/utnetv2_utils.py @@ -0,0 +1,362 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from .conv_layers import DepthwiseSeparableConv, BasicBlock, Bottleneck, MBConv, FusedMBConv, ConvNormAct +from .trans_layers import TransformerBlock + +from einops import rearrange +import pdb + + +class 
class BidirectionAttention(nn.Module):
    """Cross-attention that updates a feature map and a semantic map together.

    One similarity matrix between feature-map queries and semantic-map queries
    is computed and normalised along both axes: the softmax over the map axis
    lets the features read from the map values, and the softmax over the
    feature axis lets the map read from the feature values.

    Args:
        feat_dim: channels of the incoming feature map.
        map_dim: channels of the incoming (and outgoing) semantic map.
        out_dim: channels of the outgoing feature map.
        heads: number of attention heads.
        dim_head: channels per head.
        attn_drop: dropout rate applied to the map<-feature attention.
        proj_drop: dropout rate applied to both output projections.
        map_size: side length of the (square) semantic map.
        proj_type: 'linear' (1x1 conv) or 'depthwise' (DepthwiseSeparableConv)
            for the feature-side projections.
    """

    # einops patterns used to split channels into heads and to merge them back
    _SPLIT_HEADS = 'b (dim_head heads) h w -> b heads (h w) dim_head'
    _MERGE_HEADS = 'b heads (h w) dim_head -> b (dim_head heads) h w'

    def __init__(self, feat_dim, map_dim, out_dim, heads=4, dim_head=64, attn_drop=0.,
                 proj_drop=0., map_size=16, proj_type='depthwise'):
        super().__init__()

        width = dim_head * heads
        self.inner_dim = width
        self.feat_dim = feat_dim
        self.map_dim = map_dim
        self.heads = heads
        self.scale = dim_head ** (-0.5)
        self.dim_head = dim_head
        self.map_size = map_size

        assert proj_type in ['linear', 'depthwise']

        # feature-side projections (query+value packed together, then output)
        if proj_type == 'linear':
            self.feat_qv = nn.Conv2d(feat_dim, width * 2, kernel_size=1, bias=False)
            self.feat_out = nn.Conv2d(width, out_dim, kernel_size=1, bias=False)
        else:
            self.feat_qv = DepthwiseSeparableConv(feat_dim, width * 2)
            self.feat_out = DepthwiseSeparableConv(width, out_dim)

        # map-side projections are always 1x1 convolutions
        self.map_qv = nn.Conv2d(map_dim, width * 2, kernel_size=1, bias=False)
        self.map_out = nn.Conv2d(width, map_dim, kernel_size=1, bias=False)

        self.attn_drop = nn.Dropout(attn_drop)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, feat, semantic_map):
        """Return ``(feat_out, map_out)`` for a feature map and a semantic map."""
        B, _, H, W = feat.shape
        head_cfg = dict(dim_head=self.dim_head, heads=self.heads)
        side = self.map_size

        feat_q, feat_v = self.feat_qv(feat).chunk(2, dim=1)  # B, inner_dim, H, W
        map_q, map_v = self.map_qv(semantic_map).chunk(2, dim=1)  # B, inner_dim, rs, rs

        # split heads and flatten the spatial grid of every tensor
        feat_q = rearrange(feat_q, self._SPLIT_HEADS, h=H, w=W, **head_cfg)
        feat_v = rearrange(feat_v, self._SPLIT_HEADS, h=H, w=W, **head_cfg)
        map_q = rearrange(map_q, self._SPLIT_HEADS, h=side, w=side, **head_cfg)
        map_v = rearrange(map_v, self._SPLIT_HEADS, h=side, w=side, **head_cfg)

        scores = torch.einsum('bhid,bhjd->bhij', feat_q, map_q)
        scores *= self.scale

        # The semantic map is very concise, so the feature<-map attention gets
        # no dropout (dropout here might destabilise training).
        feat_from_map = F.softmax(scores, dim=-1)
        map_from_feat = self.attn_drop(F.softmax(scores, dim=-2))

        feat_out = torch.einsum('bhij,bhjd->bhid', feat_from_map, map_v)
        feat_out = rearrange(feat_out, self._MERGE_HEADS, h=H, w=W, **head_cfg)

        map_out = torch.einsum('bhji,bhjd->bhid', map_from_feat, feat_v)
        map_out = rearrange(map_out, self._MERGE_HEADS, b=B, h=side, w=side, **head_cfg)

        feat_out = self.proj_drop(self.feat_out(feat_out))
        map_out = self.proj_drop(self.map_out(map_out))

        return feat_out, map_out


class BidirectionAttentionBlock(nn.Module):
    """Pre-norm bidirectional attention followed by a convolutional feed-forward.

    The feature branch is ``attn(norm(x)) + shortcut(x)`` passed through the
    feed-forward module; the semantic-map branch is ``attn(norm(map)) + map``.

    Args:
        feat_dim / map_dim / out_dim: channel counts, see BidirectionAttention.
        heads, dim_head, attn_drop, proj_drop, map_size, proj_type: forwarded
            to BidirectionAttention.
        norm: normalisation layer class (or a falsy value for nn.Identity).
        act: activation class forwarded to the shortcut / feed-forward modules.
        expansion: expansion ratio of the feed-forward module.
    """

    def __init__(self, feat_dim, map_dim, out_dim, heads, dim_head, norm=nn.BatchNorm2d,
                 act=nn.GELU, expansion=4, attn_drop=0., proj_drop=0., map_size=8,
                 proj_type='depthwise'):
        super().__init__()

        assert norm in [nn.BatchNorm2d, nn.InstanceNorm2d, True, False]
        assert act in [nn.ReLU, nn.ReLU6, nn.GELU, nn.SiLU, True, False]
        assert proj_type in ['linear', 'depthwise']

        # norm1 normalises the feature map, norm2 the semantic map
        self.norm1 = norm(feat_dim) if norm else nn.Identity()
        self.norm2 = norm(map_dim) if norm else nn.Identity()

        self.attn = BidirectionAttention(
            feat_dim, map_dim, out_dim,
            heads=heads, dim_head=dim_head,
            attn_drop=attn_drop, proj_drop=proj_drop,
            map_size=map_size, proj_type=proj_type,
        )

        # identity shortcut unless the channel count changes
        if feat_dim == out_dim:
            self.shortcut = nn.Sequential()
        else:
            self.shortcut = ConvNormAct(feat_dim, out_dim, kernel_size=1, padding=0,
                                        norm=norm, act=act, preact=True)

        if proj_type == 'linear':
            # two 1x1 convolutions
            self.feedforward = FusedMBConv(out_dim, out_dim, expansion=expansion,
                                           kernel_size=1, act=act, norm=norm)
        else:
            # depthwise convolution
            self.feedforward = MBConv(out_dim, out_dim, expansion=expansion,
                                      kernel_size=3, act=act, norm=norm, p=proj_drop)

    def forward(self, x, semantic_map):
        """Return the updated ``(feature map, semantic map)`` pair."""
        out, new_map = self.attn(self.norm1(x), self.norm2(semantic_map))

        out += self.shortcut(x)
        out = self.feedforward(out)

        new_map += semantic_map

        return out, new_map
class PatchMerging(nn.Module):
    """
    Modified patch merging layer that works as down-sampling.

    The four pixels of every 2x2 neighbourhood are stacked on the channel axis
    (C -> 4C at half resolution), normalised and reduced to ``out_dim``.
    When ``map_proj`` is True a 1x1 convolution is created that projects an
    accompanying semantic map from ``dim`` to ``out_dim`` channels.
    """

    def __init__(self, dim, out_dim, norm=nn.BatchNorm2d, proj_type='depthwise', map_proj=True):
        super().__init__()
        self.dim = dim
        if proj_type == 'linear':
            self.reduction = nn.Conv2d(4 * dim, out_dim, kernel_size=1, bias=False)
        else:
            self.reduction = DepthwiseSeparableConv(4 * dim, out_dim)

        self.norm = norm(4 * dim)

        if map_proj:
            self.map_projection = nn.Conv2d(dim, out_dim, kernel_size=1, bias=False)

    def forward(self, x, semantic_map=None):
        """
        x: B, C, H, W

        Returns the down-sampled features and the projected semantic map
        (``None`` is passed through untouched).  A non-None map requires the
        layer to have been built with ``map_proj=True``.
        """
        x0 = x[:, :, 0::2, 0::2]
        x1 = x[:, :, 1::2, 0::2]
        x2 = x[:, :, 0::2, 1::2]
        x3 = x[:, :, 1::2, 1::2]

        x = torch.cat([x0, x1, x2, x3], 1)  # B, 4C, H/2, W/2

        x = self.norm(x)
        x = self.reduction(x)  # 4C -> out_dim

        if semantic_map is not None:
            semantic_map = self.map_projection(semantic_map)  # dim -> out_dim

        return x, semantic_map


class BasicLayer(nn.Module):
    """
    A basic transformer layer for one stage.

    No down-sample or up-sample operation happens in this layer; those are
    wrapped in the Down_block / Up_block.  The layer is a chain of
    ``num_blocks`` BidirectionAttentionBlock modules; only the first one maps
    ``feat_dim`` to ``out_dim``.  With ``num_blocks == 0`` the inputs are
    returned unchanged.
    """

    def __init__(self, feat_dim, map_dim, out_dim, num_blocks, heads=4, dim_head=64, expansion=1,
                 attn_drop=0., proj_drop=0., map_size=8, proj_type='depthwise',
                 norm=nn.BatchNorm2d, act=nn.GELU):
        super().__init__()

        dim1 = feat_dim
        dim2 = out_dim

        self.blocks = nn.ModuleList([])
        for _ in range(num_blocks):
            self.blocks.append(BidirectionAttentionBlock(
                dim1, map_dim, dim2, heads, dim_head, expansion=expansion,
                attn_drop=attn_drop, proj_drop=proj_drop, map_size=map_size,
                proj_type=proj_type, norm=norm, act=act))
            dim1 = out_dim

    def forward(self, x, semantic_map):
        for block in self.blocks:
            x, semantic_map = block(x, semantic_map)

        return x, semantic_map


class SemanticMapGeneration(nn.Module):
    """Pool a feature map into a fixed-size semantic map.

    Every one of the ``map_size * map_size`` map cells owns a spatial softmax
    weight map; the cell's value is the weighted sum of the projected features.
    """

    def __init__(self, feat_dim, map_dim, map_size):
        super().__init__()

        self.map_size = map_size
        self.map_dim = map_dim

        self.map_code_num = map_size * map_size

        self.base_proj = nn.Conv2d(feat_dim, map_dim, kernel_size=3, padding=1, bias=False)
        self.semantic_proj = nn.Conv2d(feat_dim, self.map_code_num, kernel_size=3, padding=1, bias=False)

    def forward(self, x):
        B = x.shape[0]
        feat = self.base_proj(x)  # B, map_dim, h, w
        weight_map = self.semantic_proj(x)  # B, map_code_num, h, w

        weight_map = weight_map.view(B, self.map_code_num, -1)
        weight_map = F.softmax(weight_map, dim=2)  # B, map_code_num, hw
        feat = feat.view(B, self.map_dim, -1)  # B, map_dim, hw

        semantic_map = torch.einsum('bij,bkj->bik', feat, weight_map)

        return semantic_map.view(B, self.map_dim, self.map_size, self.map_size)


class SemanticMapFusion(nn.Module):
    """Fuse several semantic maps with a shared transformer.

    Every map is projected to ``dim`` channels, flattened to tokens, all
    tokens are concatenated and passed through a TransformerBlock, then split
    again and projected back to each map's original channel count.  All maps
    are reshaped with the spatial size of the first map.
    """

    def __init__(self, in_dim_list, dim, heads, depth=1, norm=nn.BatchNorm2d):
        super().__init__()

        self.dim = dim

        # project all maps to the same channel num
        self.in_proj = nn.ModuleList([])
        for in_dim in in_dim_list:
            self.in_proj.append(nn.Conv2d(in_dim, dim, kernel_size=1, bias=False))

        self.fusion = TransformerBlock(dim, depth, heads, dim // heads, dim, attn_drop=0., proj_drop=0.)

        # project all maps back to their origin channel num
        self.out_proj = nn.ModuleList([])
        for in_dim in in_dim_list:
            self.out_proj.append(nn.Conv2d(dim, in_dim, kernel_size=1, bias=False))

    def forward(self, map_list):
        B, _, H, W = map_list[0].shape
        # each entry: B, L, C where L = H*W
        proj_maps = [self.in_proj[i](m).view(B, self.dim, -1).permute(0, 2, 1)
                     for i, m in enumerate(map_list)]

        proj_maps = torch.cat(proj_maps, dim=1)

        attned_maps = self.fusion(proj_maps)

        attned_maps = attned_maps.chunk(len(map_list), dim=1)

        maps_out = [self.out_proj[i](tokens.permute(0, 2, 1).view(B, self.dim, H, W))
                    for i, tokens in enumerate(attned_maps)]

        return maps_out


#######################################################################
# UTNet block that for one stage, which contains conv block and trans block


class inconv(nn.Module):
    """Input stem: a 3x3 convolution followed by one convolutional block."""

    def __init__(self, in_ch, out_ch, block=BasicBlock, norm=nn.BatchNorm2d, act=nn.GELU):
        super().__init__()
        self.conv1 = nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1, bias=False)

        self.conv2 = block(out_ch, out_ch, norm=norm, act=act)

    def forward(self, x):
        # A 5-D input carries an extra axis at position 1; drop it when it is
        # a singleton.  The rank must be tested with dim(): comparing the
        # torch.Size object itself to 5 is always False.
        if x.dim() == 5:
            x = x.squeeze(1)
        out = self.conv1(x)  # in_ch -> out_ch, spatial size kept
        out = self.conv2(out)  # conv block (out_ch -> out_ch)

        return out


class Down_block(nn.Module):
    """Encoder stage: patch merging, conv blocks, optional map generation, attention.

    Args:
        in_ch / out_ch: channels before / after the stage.
        conv_num: number of ``conv_block`` modules.
        trans_num: number of bidirectional attention blocks.
        map_generate: create the semantic map from this stage's features
            (replacing any incoming map).
        map_proj: let the patch-merging layer project an incoming map.
        map_dim: channels of the semantic map, defaults to ``out_ch``.
        Remaining arguments are forwarded to the sub-modules.
    """

    def __init__(self, in_ch, out_ch, conv_num, trans_num, conv_block=BasicBlock,
                 heads=4, dim_head=64, expansion=4, attn_drop=0., proj_drop=0., map_size=8,
                 proj_type='depthwise', norm=nn.BatchNorm2d, act=nn.GELU, map_generate=False,
                 map_proj=True, map_dim=None):
        super().__init__()

        map_dim = out_ch if map_dim is None else map_dim
        self.map_generate = map_generate
        if map_generate:
            self.map_gen = SemanticMapGeneration(out_ch, map_dim, map_size)

        self.patch_merging = PatchMerging(in_ch, out_ch, proj_type=proj_type, norm=norm, map_proj=map_proj)

        block_list = [conv_block(out_ch, out_ch, norm=norm, act=act) for _ in range(conv_num)]
        self.conv_blocks = nn.Sequential(*block_list)

        self.trans_blocks = BasicLayer(out_ch, map_dim, out_ch, num_blocks=trans_num,
                                       heads=heads, dim_head=dim_head, norm=norm, act=act,
                                       expansion=expansion, attn_drop=attn_drop, proj_drop=proj_drop,
                                       map_size=map_size, proj_type=proj_type)

    def forward(self, x, semantic_map=None):

        x, semantic_map = self.patch_merging(x, semantic_map)  # in_ch -> out_ch

        out = self.conv_blocks(x)  # out_ch -> out_ch
        if self.map_generate:
            semantic_map = self.map_gen(out)  # B, map_dim, map_size, map_size

        out, semantic_map = self.trans_blocks(out, semantic_map)

        return out, semantic_map


class Up_block(nn.Module):
    """Decoder stage: upsample, fuse with the skip feature, attention, conv blocks.

    Args:
        in_ch: channels of the low-resolution input.
        out_ch: channels of the skip feature and of the output.
        map_shortcut: build ``map_reduction`` for the concatenation of the
            decoder map and the encoder map (``in_ch + out_ch`` channels)
            instead of the decoder map alone (``in_ch`` channels).
        map_dim: channels of the outgoing semantic map, defaults to ``out_ch``.
        Remaining arguments are forwarded to the sub-modules.
    """

    def __init__(self, in_ch, out_ch, conv_num, trans_num, conv_block=BasicBlock,
                 heads=4, dim_head=64, expansion=1, attn_drop=0., proj_drop=0., map_size=8,
                 proj_type='linear', norm=nn.BatchNorm2d, act=nn.GELU, map_dim=None,
                 map_shortcut=False):
        super().__init__()

        self.reduction = nn.Conv2d(in_ch + out_ch, out_ch, kernel_size=1, padding=0, bias=False)
        self.norm = norm(in_ch + out_ch)

        self.map_shortcut = map_shortcut
        map_dim = out_ch if map_dim is None else map_dim
        if map_shortcut:
            self.map_reduction = nn.Conv2d(in_ch + out_ch, map_dim, kernel_size=1, bias=False)
        else:
            self.map_reduction = nn.Conv2d(in_ch, map_dim, kernel_size=1, bias=False)

        self.trans_blocks = BasicLayer(out_ch, map_dim, out_ch, num_blocks=trans_num,
                                       heads=heads, dim_head=dim_head, norm=norm, act=act,
                                       expansion=expansion, attn_drop=attn_drop, proj_drop=proj_drop,
                                       map_size=map_size, proj_type=proj_type)

        conv_list = [conv_block(out_ch, out_ch, norm=norm, act=act) for _ in range(conv_num)]
        self.conv_blocks = nn.Sequential(*conv_list)

    def forward(self, x1, x2, map1, map2=None):
        # x1: low-res feature, x2: high-res feature
        # map1: semantic map from previous low-res layer
        # map2: semantic map from encoder shortcut path, might be none if we don't have the map from encoder

        x1 = F.interpolate(x1, size=x2.shape[-2:], mode='bilinear', align_corners=True)
        feat = torch.cat([x1, x2], dim=1)

        out = self.reduction(self.norm(feat))

        if self.map_shortcut and map2 is not None:
            semantic_map = torch.cat([map1, map2], dim=1)
        else:
            semantic_map = map1
        semantic_map = self.map_reduction(semantic_map)

        out, semantic_map = self.trans_blocks(out, semantic_map)
        out = self.conv_blocks(out)

        return out, semantic_map
import math
import sys
from typing import Iterable
import os
import torch
from torchvision.transforms import ToPILImage
import SSL_structures.misc as misc
import utils.schedulers as lr_sched
from utils.visual_usage import unpatchify, patchify, Draw_tri_fig


def train_one_epoch(model: torch.nn.Module,
                    data_loader: Iterable, optimizer: torch.optim.Optimizer,
                    device: torch.device, epoch: int, loss_scaler, fix_position_ratio_scheduler=None,
                    puzzle_patch_size_scheduler=None, check_samples=1, print_freq=20, log_writer=None, args=None):
    """
    Run one pre-training epoch and save visual check images at its end.

    :param model: network called as ``model(samples, ...)``; it returns
        ``(loss, pred, patches_or_mask)``
    :param data_loader: iterable yielding ``(samples, _)`` batches
    :param optimizer: optimizer stepped through ``loss_scaler``
    :param device: device the samples are moved to
    :param epoch: current epoch index (also used as the logging step)
    :param loss_scaler: callable performing backward + optimizer step
    :param fix_position_ratio_scheduler: callable ``epoch -> ratio``; used only
        together with ``puzzle_patch_size_scheduler``
    :param puzzle_patch_size_scheduler: callable ``epoch -> patch size``
    :param check_samples: number of samples of the last batch to paint
    :param print_freq: console logging interval in iterations
    :param log_writer: optional writer exposing ``add_scalar``
    :param args: namespace; reads accum_iter, log_dir, model, mask_ratio,
        DDP_distributed and output_dir

    :return: dict mapping every meter name to its global average
    """
    model.train(True)

    # update logger
    metric_logger = misc.MetricLogger(delimiter=" ")
    # initialise the learning-rate meter
    metric_logger.add_meter('lr', misc.SmoothedValue(window_size=1, fmt='{value:.6f}'))

    header = 'Epoch: [{}]'.format(epoch)

    accum_iter = args.accum_iter

    optimizer.zero_grad()

    if log_writer is not None:  # Tensorboard PATH
        print('log_dir: {}'.format(args.log_dir))

    # Iteration
    for data_iter_step, (samples, _) in enumerate(metric_logger.log_every(data_loader, print_freq, header)):

        # per-iteration lr scheduler: uses the fractional epoch position
        # (data_iter_step / len(data_loader) + epoch) to adjust the learning rate more precisely
        if data_iter_step % accum_iter == 0:
            lr_sched.adjust_learning_rate(optimizer, data_iter_step / len(data_loader) + epoch, args)

        # fetch the data
        samples = samples.to(device, non_blocking=True)

        with torch.cuda.amp.autocast():  # automatic mixed precision to speed up training

            if fix_position_ratio_scheduler is not None and puzzle_patch_size_scheduler is not None:  # SAE
                fix_position_ratio = fix_position_ratio_scheduler(epoch)
                puzzle_patch_size = puzzle_patch_size_scheduler(epoch)
            else:
                fix_position_ratio = None
                puzzle_patch_size = None

            if args.model[0:3] == 'sae':
                loss, pred, imgs_puzzled_patches = model(samples, fix_position_ratio=fix_position_ratio,
                                                         puzzle_patch_size=puzzle_patch_size)  # SAE
            else:  # args.model[0:3] == 'mae'
                loss, pred, mask_patch_indicators = model(samples, mask_ratio=args.mask_ratio)  # MAE
                # fixme mae curriculum maybe not good enough for future
        if args.DDP_distributed:
            loss_value = loss.item()
        else:
            # NOTE(review): with more than one visible GPU the loss is summed element-wise,
            # presumably because it arrives as one value per device - confirm against the caller
            loss_value = float(loss.cpu().detach().numpy()) \
                if torch.cuda.device_count() == 1 else sum(loss.cpu().detach().numpy())

        if not math.isfinite(loss_value):  # make sure the loss has not exploded
            print("Loss is {}, stopping training".format(loss_value))
            sys.exit(1)

        # the loss is per mini-batch; with gradient accumulation its share is scaled down,
        # the contributions are accumulated through loss_scaler
        loss = loss / accum_iter

        # core of the backward pass: loss_scaler bundles loss.backward + optimizer.step
        # and adds gradient clipping
        loss_scaler(loss, optimizer, parameters=model.parameters(),
                    update_grad=(data_iter_step + 1) % accum_iter == 0)

        if (data_iter_step + 1) % accum_iter == 0:
            optimizer.zero_grad()

        torch.cuda.synchronize()  # wait for all kernels in all streams on the current device

        # update the records
        metric_logger.update(loss=loss_value)
        lr = optimizer.param_groups[0]["lr"]
        metric_logger.update(lr=lr)

        # loss averaged over the processes
        loss_value_reduce = misc.all_reduce_mean(loss_value)

        if log_writer is not None:
            # the epoch index is used as the step of every scalar
            log_writer.add_scalar('train_loss', loss_value_reduce, epoch)
            log_writer.add_scalar('lr', lr, epoch)

            if fix_position_ratio is not None and puzzle_patch_size is not None:
                log_writer.add_scalar('puzzle_patch_size', puzzle_patch_size, epoch)
                log_writer.add_scalar('fix_position_ratio', fix_position_ratio, epoch)

    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    if fix_position_ratio is not None and puzzle_patch_size is not None:
        print("Averaged stats:", metric_logger, 'fix_position_ratio:', fix_position_ratio,
              ' puzzle_patch_size:', puzzle_patch_size)
    else:
        print("Averaged stats:", metric_logger)

    # TODO: currently, only paint at the end of each epoch Train,
    # everything below works on the last batch of the epoch
    if args.model[0:3] == 'sae':
        imgs_puzzled_batch = unpatchify(imgs_puzzled_patches, patch_size=16)
    else:  # MAE
        sample_img_patches = patchify(samples, patch_size=16)  # on GPU
        # broadcast the per-patch indicator over the flattened patch dimension
        masked_img_patches = sample_img_patches * \
                             mask_patch_indicators.unsqueeze(-1).expand(-1, -1,
                                                                        sample_img_patches.shape[-1])
        masked_img_batch = unpatchify(masked_img_patches, patch_size=16)

    # paint images at the end of each epoch on main process
    if misc.is_main_process():
        for sampleIDX in range(check_samples):

            sample_img = samples.cpu()[sampleIDX]
            sample_img = ToPILImage()(sample_img)
            sample_img.save(os.path.join(args.output_dir, 'figs', 'sample_e_' + str(epoch)
                                         + '_sampleIDX_' + str(sampleIDX) + '.jpg'))

            recons_img_batch = unpatchify(pred, patch_size=16)
            recons_img = recons_img_batch.cpu()[sampleIDX]
            recons_img = ToPILImage()(recons_img)
            recons_img.save(os.path.join(args.output_dir, 'figs', 'recons_e_' + str(epoch)
                                         + '_sampleIDX_' + str(sampleIDX) + '.jpg'))

            if args.model[0:3] == 'sae':  # SAE
                puzzled_img = imgs_puzzled_batch.cpu()[sampleIDX]
                puzzled_img = ToPILImage()(puzzled_img)
                puzzled_img.save(os.path.join(args.output_dir, 'figs', 'puzzled_e_' + str(epoch)
                                              + '_sampleIDX_' + str(sampleIDX) + '.jpg'))

                # same path as the image saved just above
                picpath = os.path.join(args.output_dir, 'figs', 'puzzled_e_' + str(epoch)
                                       + '_sampleIDX_' + str(sampleIDX) + '.jpg')
                Draw_tri_fig(sample_img, puzzled_img, recons_img, picpath)

            else:  # MAE
                masked_img = masked_img_batch.cpu()[sampleIDX]  # put on CPU
                masked_img = ToPILImage()(masked_img)
                masked_img.save(os.path.join(args.output_dir, 'figs', 'masked_e_' + str(epoch)
                                             + '_sampleIDX_' + str(sampleIDX) + '.jpg'))

                # same path as the image saved just above
                picpath = os.path.join(args.output_dir, 'figs', 'masked_e_' + str(epoch)
                                       + '_sampleIDX_' + str(sampleIDX) + '.jpg')
                Draw_tri_fig(sample_img, masked_img, recons_img, picpath)

    # return the records; everything else has already been accumulated inside the logger
    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
"""
pre-training funcs Script ver: Feb 8th 16:00
The loss backward step has been modified.

"""
import builtins
import datetime
import os
import time
from collections import defaultdict, deque
from pathlib import Path

import torch
import torch.distributed as dist

try:
    from torch import inf
except ImportError:  # older torch releases keep it in torch._six
    from torch._six import inf


# SmoothedValue operator
class SmoothedValue(object):
    """Track a series of values and provide access to smoothed values over a
    window or the global series average.
    """

    def __init__(self, window_size=20, fmt=None):
        if fmt is None:
            fmt = "{median:.4f} ({global_avg:.4f})"
        self.deque = deque(maxlen=window_size)
        self.total = 0.0
        self.count = 0
        self.fmt = fmt

    def update(self, value, n=1):
        """Record ``value``, counted ``n`` times in the global statistics."""
        self.deque.append(value)
        self.count += n
        self.total += value * n

    def synchronize_between_processes(self):
        """
        Sum ``count`` and ``total`` over all processes.

        Warning: does not synchronize the deque!
        """
        if not is_dist_avail_and_initialized():
            return
        t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
        dist.barrier()
        dist.all_reduce(t)
        t = t.tolist()
        self.count = int(t[0])
        self.total = t[1]

    @property
    def median(self):
        d = torch.tensor(list(self.deque))
        return d.median().item()

    @property
    def avg(self):
        d = torch.tensor(list(self.deque), dtype=torch.float32)
        return d.mean().item()

    @property
    def global_avg(self):
        return self.total / self.count

    @property
    def max(self):
        return max(self.deque)

    @property
    def value(self):
        return self.deque[-1]

    def __str__(self):
        return self.fmt.format(
            median=self.median,
            avg=self.avg,
            global_avg=self.global_avg,
            max=self.max,
            value=self.value)


class MetricLogger(object):
    """Collection of named SmoothedValue meters with periodic console logging."""

    def __init__(self, delimiter="\t"):
        self.meters = defaultdict(SmoothedValue)  # SmoothedValue operator
        self.delimiter = delimiter

    def update(self, **kwargs):
        """Update the meters named by the keyword arguments (None values are skipped)."""
        for k, v in kwargs.items():
            if v is None:
                continue
            if isinstance(v, torch.Tensor):
                v = v.item()
            assert isinstance(v, (float, int))
            self.meters[k].update(v)

    def __getattr__(self, attr):
        # expose the meters as attributes; raise for unknown names
        if attr in self.meters:
            return self.meters[attr]
        if attr in self.__dict__:
            return self.__dict__[attr]
        raise AttributeError("'{}' object has no attribute '{}'".format(
            type(self).__name__, attr))

    def __str__(self):
        # one "name: meter" entry per meter, joined by the delimiter
        loss_str = []
        for name, meter in self.meters.items():
            loss_str.append(
                "{}: {}".format(name, str(meter))
            )
        return self.delimiter.join(loss_str)

    def synchronize_between_processes(self):
        """Synchronise every meter between the processes."""
        for meter in self.meters.values():
            meter.synchronize_between_processes()

    def add_meter(self, name, meter):
        """Register ``meter`` under ``name``."""
        self.meters[name] = meter

    def log_every(self, iterable, print_freq, header=None):
        """Yield the items of ``iterable`` and print progress every ``print_freq`` items.

        ``iterable`` must support ``len()``.  An empty iterable yields nothing
        and only prints the total-time line.
        """
        # iteration index
        i = 0
        if not header:
            header = ''

        # timing
        start_time = time.time()
        end = time.time()
        iter_time = SmoothedValue(fmt='{avg:.4f}')
        data_time = SmoothedValue(fmt='{avg:.4f}')
        space_fmt = ':' + str(len(str(len(iterable)))) + 'd'
        # output template
        log_msg = [
            header,
            '[{0' + space_fmt + '}/{1}]',
            'eta: {eta}',
            '{meters}',
            'time: {time}',
            'data: {data}'
        ]
        if torch.cuda.is_available():
            log_msg.append('max mem: {memory:.0f}')

        log_msg = self.delimiter.join(log_msg)

        MB = 1024.0 * 1024.0

        for obj in iterable:

            data_time.update(time.time() - end)
            yield obj  # hand the next item to the caller
            iter_time.update(time.time() - end)

            if i % print_freq == 0 or i == len(iterable) - 1:
                # estimate the remaining time
                eta_seconds = iter_time.global_avg * (len(iterable) - i)
                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
                if torch.cuda.is_available():
                    print(log_msg.format(
                        i, len(iterable), eta=eta_string,
                        meters=str(self),
                        time=str(iter_time), data=str(data_time),
                        memory=torch.cuda.max_memory_allocated() / MB))
                else:
                    print(log_msg.format(
                        i, len(iterable), eta=eta_string,
                        meters=str(self),
                        time=str(iter_time), data=str(data_time)))
            i += 1
            end = time.time()

        total_time = time.time() - start_time
        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
        # max(..., 1) keeps an empty iterable from raising ZeroDivisionError
        print('{} Total time: {} ({:.4f} s / it)'.format(
            header, total_time_str, total_time / max(len(iterable), 1)))


def setup_for_distributed(is_master):
    """
    This function disables printing when not in master process
    (``force=True`` or a world size above 8 prints regardless).
    """
    builtin_print = builtins.print

    def print(*args, **kwargs):
        force = kwargs.pop('force', False)
        force = force or (get_world_size() > 8)
        if is_master or force:
            now = datetime.datetime.now().time()
            builtin_print('[{}] '.format(now), end='')  # print with time stamp
            builtin_print(*args, **kwargs)

    builtins.print = print


def is_dist_avail_and_initialized():
    """Return True when torch.distributed is available and initialised."""
    if not dist.is_available():
        return False
    if not dist.is_initialized():
        return False
    return True


def get_world_size():
    """Return the distributed world size, 1 when not running distributed."""
    if not is_dist_avail_and_initialized():
        return 1
    return dist.get_world_size()


def get_rank():
    """Return the rank of this process, 0 when not running distributed."""
    if not is_dist_avail_and_initialized():
        return 0
    return dist.get_rank()


def is_main_process():
    """Return True on rank 0."""
    return get_rank() == 0


def save_on_master(*args, **kwargs):
    """Call ``torch.save`` on the main process only."""
    if is_main_process():
        torch.save(*args, **kwargs)


def init_distributed_mode(args):
    """
    Read the multi-process environment information and set args.DDP_distributed.

    :param args: namespace; reads dist_on_itp (and dist_url), writes rank,
        world_size, gpu, dist_backend and DDP_distributed
    :return: None
    """
    if args.dist_on_itp:
        args.rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
        args.world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
        args.gpu = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
        args.dist_url = "tcp://%s:%s" % (os.environ['MASTER_ADDR'], os.environ['MASTER_PORT'])
        os.environ['LOCAL_RANK'] = str(args.gpu)
        os.environ['RANK'] = str(args.rank)
        os.environ['WORLD_SIZE'] = str(args.world_size)
        # ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"]

    elif 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
        args.rank = int(os.environ["RANK"])
        args.world_size = int(os.environ['WORLD_SIZE'])
        args.gpu = int(os.environ['LOCAL_RANK'])

    elif 'SLURM_PROCID' in os.environ:
        # NOTE(review): this branch does not set args.world_size; it has to be
        # provided by the caller - confirm
        args.rank = int(os.environ['SLURM_PROCID'])
        args.gpu = args.rank % torch.cuda.device_count()

    else:
        print('Not using DDP_distributed mode')
        setup_for_distributed(is_master=True)  # hack
        args.DDP_distributed = False
        return

    args.DDP_distributed = True

    torch.cuda.set_device(args.gpu)
    args.dist_backend = 'nccl'
    print('| DDP_distributed init (rank {}): {}, gpu {}'.format(
        args.rank, args.dist_url, args.gpu), flush=True)
    torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                         world_size=args.world_size, rank=args.rank)
    torch.distributed.barrier()
    setup_for_distributed(args.rank == 0)


class NativeScalerWithGradNormCount:
    """
    Loss scaler used as the "loss optimizer".

    Wraps the automatic-mixed-precision GradScaler and bundles backward,
    optional gradient clipping and the optimizer step.
    """
    state_dict_key = "amp_scaler"

    def __init__(self, GPU_count=1, DDP_distributed=False):
        self._scaler = torch.cuda.amp.GradScaler()
        self.GPU_count = GPU_count
        self.DDP_distributed = DDP_distributed

    def __call__(self, loss, optimizer, clip_grad=None, parameters=None, create_graph=False, update_grad=True):
        """Back-propagate ``loss`` and, when ``update_grad`` is True, step the optimizer.

        :param loss: scalar loss, or a vector of losses (one per device)
        :param optimizer: optimizer to step
        :param clip_grad: max gradient norm; None disables clipping
        :param parameters: parameters whose gradient norm is clipped / measured
        :param create_graph: forwarded to ``backward``
        :param update_grad: False only accumulates gradients

        :return: the gradient norm, or None when ``update_grad`` is False
        """
        # a 0-dim loss (DDP or single GPU) gets one axis so both cases share a code path
        if self.DDP_distributed or self.GPU_count == 1:
            loss = loss.unsqueeze(-1)

        # The seed gradient has to be ones: every loss element contributes with weight 1.
        # (Seeding with the loss itself would multiply all gradients by the loss value.)
        self._scaler.scale(loss).backward(torch.ones_like(loss), create_graph=create_graph)

        if update_grad:
            # unscale the gradients of optimizer's assigned params in-place
            self._scaler.unscale_(optimizer)
            if clip_grad is not None:
                assert parameters is not None
                norm = torch.nn.utils.clip_grad_norm_(parameters, clip_grad)
            else:
                norm = get_grad_norm_(parameters)

            self._scaler.step(optimizer)  # update the model through the optimizer

            self._scaler.update()
        else:
            norm = None

        return norm

    def state_dict(self):
        """Return the state_dict of the wrapped GradScaler."""
        return self._scaler.state_dict()

    def load_state_dict(self, state_dict):
        """Restore the wrapped GradScaler from a checkpoint state_dict."""
        self._scaler.load_state_dict(state_dict)


def get_grad_norm_(parameters, norm_type: float = 2.0) -> torch.Tensor:
    """Return the total ``norm_type`` norm of the gradients of ``parameters``.

    ``parameters`` may be a tensor, an iterable of tensors or None; parameters
    without a gradient are ignored and an empty selection yields ``tensor(0.)``.
    """
    if parameters is None:
        parameters = []
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]

    # keep the parameters that actually hold a gradient
    parameters = [p for p in parameters if p.grad is not None]
    norm_type = float(norm_type)

    if len(parameters) == 0:
        return torch.tensor(0.)

    # work on the device of the first gradient
    device = parameters[0].grad.device

    if norm_type == inf:
        # infinity norm: largest absolute gradient entry
        total_norm = max(p.grad.detach().abs().max().to(device) for p in parameters)
    else:
        # p-norm of the per-parameter p-norms
        total_norm = torch.norm(
            torch.stack([torch.norm(p.grad.detach(), norm_type).to(device) for p in parameters]),
            norm_type)

    return total_norm


def save_model(args, epoch, model, model_without_ddp, optimizer, loss_scaler, model_idx='SAE_'):
    """Save a training checkpoint for ``epoch`` into ``args.output_dir``.

    With a loss scaler the model / optimizer / scaler states are written by
    the main process; without one ``model.save_checkpoint`` is called.
    """
    output_dir = Path(args.output_dir)
    epoch_name = str(epoch)

    if loss_scaler is not None:
        checkpoint_paths = [output_dir / (model_idx+'_checkpoint-%s.pth' % epoch_name)]
        for checkpoint_path in checkpoint_paths:
            to_save = {
                'model': model_without_ddp.state_dict(),
                'optimizer': optimizer.state_dict(),
                'epoch': epoch,
                'scaler': loss_scaler.state_dict(),
                'args': args,  # the configuration is stored but not restored on load
            }

            save_on_master(to_save, checkpoint_path)
    else:
        client_state = {'epoch': epoch}
        model.save_checkpoint(save_dir=args.output_dir, tag="checkpoint-%s" % epoch_name, client_state=client_state)


def load_model(args, model_without_ddp, optimizer, loss_scaler):
    """Resume model (and, outside eval mode, optimizer / scaler) from ``args.resume``.

    Nothing is loaded when ``args.resume`` is empty.
    """
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')

        model_without_ddp.load_state_dict(checkpoint['model'])

        print("Resume checkpoint %s" % args.resume)

        if 'optimizer' in checkpoint and 'epoch' in checkpoint and not (hasattr(args, 'eval') and args.eval):
            optimizer.load_state_dict(checkpoint['optimizer'])
            args.start_epoch = checkpoint['epoch'] + 1

            if 'scaler' in checkpoint:
                loss_scaler.load_state_dict(checkpoint['scaler'])

            print("With optim & sched!")


# loss averaged over the processes
def all_reduce_mean(x):
    """Return the mean of ``x`` over all processes (``x`` itself for a world size of 1)."""
    world_size = get_world_size()

    if world_size > 1:
        x_reduce = torch.tensor(x).cuda()
        dist.all_reduce(x_reduce)
        x_reduce /= world_size
        return x_reduce.item()

    else:
        return x
class MaskedAutoencoderViT(VPT_ViT):
    """
    Masked Autoencoder (MAE) with a ViT encoder that can optionally be prompt-tuned (VPT).

    The encoder only processes a random subset of the patch tokens. The decoder then reconstructs all
    patches, either with the original lightweight Transformer decoder (``decoder=None``) or with an
    external image-to-image network passed in as ``decoder``. The reconstruction loss is computed inside
    the model, so ``forward`` returns ``(loss, pred, mask)``.
    """

    def __init__(self, img_size=224, patch_size=16, in_chans=3,
                 embed_dim=1024, depth=24, num_heads=16,
                 decoder_embed_dim=512, decoder_depth=8, decoder_num_heads=16,
                 mlp_ratio=4., norm_layer=nn.LayerNorm, norm_pix_loss=False,
                 prompt_mode=None, Prompt_Token_num=20, basic_state_dict=None, decoder=None, decoder_rep_dim=None):
        """
        :param img_size: edge size of the (square) input image
        :param patch_size: edge size of one patch
        :param in_chans: number of image channels
        :param embed_dim: encoder token dimension
        :param depth: number of encoder blocks
        :param num_heads: number of encoder attention heads
        :param decoder_embed_dim: token dimension fed to the decoder
        :param decoder_depth: number of blocks of the built-in Transformer decoder
        :param decoder_num_heads: attention heads of the built-in Transformer decoder
        :param mlp_ratio: MLP expansion ratio of the Transformer blocks
        :param norm_layer: normalisation layer constructor
        :param norm_pix_loss: normalise every target patch before computing the loss
        :param prompt_mode: None for a plain ViT encoder, otherwise the VPT type ("Deep" / "Shallow")
        :param Prompt_Token_num: number of prompt tokens (VPT only)
        :param basic_state_dict: optional state_dict loaded non-strictly after initialisation
        :param decoder: optional external decoder module; None selects the built-in Transformer decoder
        :param decoder_rep_dim: per-patch feature size produced by ``patchify_decoder`` on the output of
            the external decoder; required when ``decoder`` is given

        :raises ValueError: if ``decoder`` is given without ``decoder_rep_dim``
        """
        # fail early with a readable message instead of the obscure error raised by nn.Linear(None, ...)
        if decoder is not None and decoder_rep_dim is None:
            raise ValueError('decoder_rep_dim is required when a custom decoder is given')

        if prompt_mode is None:
            super().__init__()
            # MAE encoder specifics (the same as a ViT)
            # --------------------------------------------------------------------------
            self.patch_embed = PatchEmbed(img_size, patch_size, in_chans, embed_dim)  # BCHW -> BNC
            num_patches = self.patch_embed.num_patches

            # a learnable cls token is kept although no classification head is used here
            self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
            # frozen encoder pos_embed: fixed sin-cos embedding for the cls token + patch tokens
            self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim), requires_grad=False)
            # Encoder blocks (qk_scale=None is omitted, fixme: related to the timm version)
            self.blocks = nn.ModuleList([
                Block(embed_dim, num_heads, mlp_ratio, qkv_bias=True, norm_layer=norm_layer)
                for _ in range(depth)])
            self.norm = norm_layer(embed_dim)

            self.prompt_mode = prompt_mode
            # --------------------------------------------------------------------------

        else:
            # Build the VPT encoder first; its state_dict is deliberately left unset here because
            # basic_state_dict is loaded once the whole MAE has been assembled (end of __init__).
            super().__init__(img_size=img_size, patch_size=patch_size, in_chans=in_chans,
                             embed_dim=embed_dim, depth=depth, num_heads=num_heads, mlp_ratio=mlp_ratio,
                             norm_layer=norm_layer, Prompt_Token_num=Prompt_Token_num, VPT_type=prompt_mode,
                             basic_state_dict=None)
            num_patches = self.patch_embed.num_patches  # patch_embed was created by VPT_ViT
            # frozen encoder pos_embed: fixed sin-cos embedding for the cls token + patch tokens
            self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim), requires_grad=False)

            self.prompt_mode = prompt_mode
            # Freeze the encoder parameters except for the prompt tokens
            self.Freeze()

        # MAE decoder specifics
        # --------------------------------------------------------------------------
        # align the feature dimension when encoder and decoder widths differ
        if embed_dim != decoder_embed_dim:
            self.decoder_embed = nn.Linear(embed_dim, decoder_embed_dim, bias=True)
        else:
            self.decoder_embed = nn.Identity()

        if decoder is not None:
            self.decoder = decoder
            # learnable mask token; it is inserted BEFORE decoder_embed, so it lives in encoder width
            self.mask_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
            # FC layer mapping the re-patchified decoder output to the pixels of one patch
            self.decoder_pred = nn.Linear(decoder_rep_dim, patch_size ** 2 * in_chans, bias=True)

        else:
            # no external decoder: build the original MAE Transformer decoder at decoder width
            self.decoder = None
            # learnable mask token; it is inserted AFTER decoder_embed, so it lives in decoder width
            self.mask_token = nn.Parameter(torch.zeros(1, 1, decoder_embed_dim))

            # frozen decoder pos_embed: fixed sin-cos embedding for the cls token + patch tokens
            self.decoder_pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, decoder_embed_dim),
                                                  requires_grad=False)
            # (qk_scale=None is omitted, fixme: related to the timm version)
            self.decoder_blocks = nn.ModuleList([Block(decoder_embed_dim, decoder_num_heads, mlp_ratio,
                                                       qkv_bias=True, norm_layer=norm_layer)
                                                 for _ in range(decoder_depth)])
            self.decoder_norm = norm_layer(decoder_embed_dim)

            # FC layer mapping a decoder token to the pixels of one patch
            self.decoder_pred = nn.Linear(decoder_embed_dim, patch_size ** 2 * in_chans, bias=True)

        # --------------------------------------------------------------------------
        # whether to normalise each target patch in the loss
        self.norm_pix_loss = norm_pix_loss
        # parameter initialization
        self.initialize_weights()

        # load the backbone state_dict for transfer-learning-based tuning (non-strict)
        if basic_state_dict is not None:
            self.load_state_dict(basic_state_dict, False)

    def initialize_weights(self):
        """Fill the fixed sin-cos position embeddings and initialise tokens, Linear and LayerNorm layers."""
        grid_size = int(self.patch_embed.num_patches ** .5)

        # encoder: 2D sin-cos embedding of shape [1 + grid_size*grid_size, embed_dim] (cls slot first)
        pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], grid_size, cls_token=True)
        self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))

        if self.decoder is None:
            # the built-in decoder has its own embedding at decoder width
            decoder_pos_embed = get_2d_sincos_pos_embed(self.decoder_pos_embed.shape[-1], grid_size,
                                                        cls_token=True)
            self.decoder_pos_embed.data.copy_(torch.from_numpy(decoder_pos_embed).float().unsqueeze(0))

        # initialize patch_embed like nn.Linear (instead of nn.Conv2d): the conv kernel is viewed as a
        # 2D matrix so xavier_uniform balances the variance of inputs and outputs
        w = self.patch_embed.proj.weight.data
        torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1]))

        # timm's trunc_normal_(std=.02) is effectively normal_(std=0.02) as cutoff is too big (2.)
        torch.nn.init.normal_(self.cls_token, std=.02)
        torch.nn.init.normal_(self.mask_token, std=.02)

        # initialize every nn.Linear and nn.LayerNorm submodule
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise one submodule: xavier-uniform Linear weights, zero biases, unit LayerNorm weights."""
        if isinstance(m, nn.Linear):
            # we use xavier_uniform following official JAX ViT:
            torch.nn.init.xavier_uniform_(m.weight)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            # a LayerNorm built with elementwise_affine=False has neither bias nor weight
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
            if m.weight is not None:
                nn.init.constant_(m.weight, 1.0)

    def patchify(self, imgs):
        """
        Split images into flattened patches.

        input:
        imgs: (B, C, H, W) with H == W and H divisible by the patch size

        output:
        x: (B, num_patches, patch_size**2 * C) AKA [B, num_patches, flatten_dim]
        """
        p = self.patch_embed.patch_size[0]
        # H == W and the image edge is divisible by the patch edge
        assert imgs.shape[2] == imgs.shape[3] and imgs.shape[2] % p == 0
        # number of patches per row / column
        h = w = imgs.shape[2] // p
        # the channel count is read from the input instead of being hard-coded to 3
        c = imgs.shape[1]

        # [B, C, H, W] -> [B, C, h_p, p, w_p, p]
        x = imgs.reshape(shape=(imgs.shape[0], c, h, p, w, p))
        # [B, C, h_p, p, w_p, p] -> [B, h_p, w_p, p, p, C]
        x = torch.einsum('nchpwq->nhwpqc', x)
        # [B, h_p, w_p, p, p, C] -> [B, num_patches, flatten_dim]
        x = x.reshape(shape=(imgs.shape[0], h * w, p ** 2 * c))
        return x

    def patchify_decoder(self, imgs, patch_size=None):  # TODO: central to making the pre-training work
        """
        Split the output of the external decoder into flattened patches.

        fixme: the default patch_size should follow the decoder network setting; for now it falls back
        to the encoder patch size.

        input:
        imgs: (B, CLS, H, W)

        output:
        x: (B, num_patches, -1) AKA [B, num_patches, -1]
        """
        patch_size = self.patch_embed.patch_size[0] if patch_size is None else patch_size

        # H == W and the image edge is divisible by the patch edge
        assert imgs.shape[2] == imgs.shape[3] and imgs.shape[2] % patch_size == 0
        # number of patches per row / column
        h = w = imgs.shape[2] // patch_size

        # [B, C, H, W] -> [B, C, h_p, patch_size, w_p, patch_size]
        x = imgs.reshape(shape=(imgs.shape[0], -1, h, patch_size, w, patch_size))
        # -> [B, h_p, w_p, patch_size, patch_size, C]
        x = torch.einsum('nchpwq->nhwpqc', x)
        # -> [B, num_patches, flatten_dim]
        x = x.reshape(shape=(imgs.shape[0], h * w, -1))
        return x

    def unpatchify(self, x, patch_size=None):
        """
        Reassemble flattened patches into images (the inverse of ``patchify``).

        input:
        x: (B, num_patches, patch_size**2 * C) AKA [B, num_patches, flatten_dim], without a cls token

        output:
        imgs: (B, C, H, W)
        """
        p = self.patch_embed.patch_size[0] if patch_size is None else patch_size

        # square root of num_patches (the sequence must not contain the cls token)
        h = w = int(x.shape[1] ** .5)
        assert h * w == x.shape[1]
        # the channel count is derived from flatten_dim instead of being hard-coded to 3;
        # an incompatible flatten_dim still fails in the reshape below, as before
        c = x.shape[2] // (p * p)

        # [B, num_patches, flatten_dim] -> [B, h_p, w_p, p, p, C]
        x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
        # [B, h_p, w_p, p, p, C] -> [B, C, h_p, p, w_p, p]
        x = torch.einsum('nhwpqc->nchpwq', x)
        # [B, C, h_p, p, w_p, p] -> [B, C, H, W]
        imgs = x.reshape(shape=(x.shape[0], c, h * p, w * p))
        return imgs

    def random_masking(self, x, mask_ratio):
        """
        Perform per-sample random masking by per-sample shuffling.
        Per-sample shuffling is done by argsort of random noise.

        Note: torch.argsort returns, along the given dim, the original indices of the elements once
        they are sorted in ascending order.

        input:
        x: [B, num_patches, D], sequence of Tokens

        output: x_remained, mask, ids_restore
        x_remained: [B, num_patches * (1-mask_ratio), D], sequence of Tokens
        mask: [B, num_patches], binary mask (0 is keep, 1 is remove)
        ids_restore: [B, num_patches], idx of restoring all position
        """
        B, num_patches, D = x.shape  # batch, length, dim
        # number of tokens that are kept
        len_keep = int(num_patches * (1 - mask_ratio))
        # one random score per token, used only to draw a random order
        noise = torch.rand(B, num_patches, device=x.device)  # noise in [0, 1]

        # indices that sort the noise of every sample: small is keep, large is remove
        ids_shuffle = torch.argsort(noise, dim=1)
        # argsort of the permutation gives the rank of every original position, i.e. its inverse
        ids_restore = torch.argsort(ids_shuffle, dim=1)

        # keep the first subset
        ids_keep = ids_shuffle[:, :len_keep]

        # ids_keep [B, keep_patches] -> [B, keep_patches, 1] -> [B, keep_patches, D] so that gather
        # picks the whole feature vector of every kept token (x_remained is called x_masked in MAE)
        x_remained = torch.gather(x, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, D))

        # binary mask in shuffled order: the first len_keep entries are kept (0), the rest removed (1)
        mask = torch.ones([B, num_patches], device=x.device)
        mask[:, :len_keep] = 0
        # bring the mask back to the original token order
        mask = torch.gather(mask, dim=1, index=ids_restore)

        return x_remained, mask, ids_restore

    def forward_encoder(self, imgs, mask_ratio):
        """
        :param imgs: [B, C, H, W], batch of images
        :param mask_ratio: fraction of patch tokens hidden from the encoder

        :return: Encoder output: encoded tokens, mask position, restore idxs
        x: [B, 1 + num_patches * (1-mask_ratio), D], sequence of Tokens (including the cls token)
        mask: [B, num_patches], binary mask
        ids_restore: [B, num_patches], idx of restoring all position
        """
        # embed patches: BCHW -> BNC
        x = self.patch_embed(imgs)
        # add pos embed (without the cls slot) before the cls token is concatenated
        x = x + self.pos_embed[:, 1:, :]

        # masking: length -> length * (1-mask_ratio)
        x, mask, ids_restore = self.random_masking(x, mask_ratio)

        # prepend the cls token (with its own pos embed), expanded to the batch size
        cls_token = self.cls_token + self.pos_embed[:, :1, :]
        x = torch.cat((cls_token.expand(x.shape[0], -1, -1), x), dim=1)

        if self.prompt_mode is None:  # ViT
            for blk in self.blocks:
                x = blk(x)

        elif self.VPT_type == "Deep":  # VPT-Deep: fresh prompt tokens for every block
            Prompt_Token_num = self.Prompt_Tokens.shape[1]
            for i in range(len(self.blocks)):
                Prompt_Tokens = self.Prompt_Tokens[i].unsqueeze(0)
                # firstly concatenate the prompt tokens of this block
                x = torch.cat((x, Prompt_Tokens.expand(x.shape[0], -1, -1)), dim=1)
                num_tokens = x.shape[1]
                # lastly remove their outputs so the next block gets its own prompts
                x = self.blocks[i](x)[:, :num_tokens - Prompt_Token_num]

        else:  # VPT-Shallow: prompt tokens are concatenated once, before the first block
            Prompt_Token_num = self.Prompt_Tokens.shape[1]
            Prompt_Tokens = self.Prompt_Tokens.expand(x.shape[0], -1, -1)
            x = torch.cat((x, Prompt_Tokens), dim=1)
            num_tokens = x.shape[1]
            # the blocks are applied as one sequential module, then the prompt outputs are dropped
            x = self.blocks(x)[:, :num_tokens - Prompt_Token_num]

        # last norm of Transformer
        x = self.norm(x)

        return x, mask, ids_restore

    def forward_decoder(self, x, ids_restore):
        """
        :param x: [B, 1 + num_patches * (1-mask_ratio), D], sequence of Tokens (including the cls token)
        :param ids_restore: restore idxs, as returned by ``random_masking``

        :return: Decoder output: reconstructed patches
        x: [B, num_patches, patch_size**2 * in_chans]
        """
        if self.decoder is None:
            # embed tokens: [B, num_encoded_tokens, embed_dim] -> [B, num_encoded_tokens, D_Decoder]
            x = self.decoder_embed(x)

            # mask tokens fill the sequence up to num_patches; "+ 1" compensates for the cls token that
            # is still counted in x.shape[1]: ids_restore.shape[1] - (x.shape[1] - 1)
            mask_tokens = self.mask_token.repeat(x.shape[0], ids_restore.shape[1] + 1 - x.shape[1], 1)

            # strip the cls token and append the mask tokens -> [B, num_patches, D_Decoder]
            x_ = torch.cat([x[:, 1:, :], mask_tokens], dim=1)
            # unshuffle: put every token back to its original position
            x_ = torch.gather(x_, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, x.shape[2]))

            # put the cls token back in front -> [B, 1 + num_patches, D_Decoder]
            x = torch.cat([x[:, :1, :], x_], dim=1)

            # add pos embed
            x = x + self.decoder_pos_embed

            # apply Transformer blocks
            for blk in self.decoder_blocks:
                x = blk(x)
            x = self.decoder_norm(x)

            # Reconstruction projection [B, 1 + num_patches, D_Decoder] -> [B, 1 + num_patches, p*p*C]
            x = self.decoder_pred(x)

            # remove cls token
            x = x[:, 1:, :]

        else:
            # mask tokens fill the sequence up to num_patches (see the note on "+ 1" above); here they
            # are inserted before decoder_embed, at encoder width
            mask_tokens = self.mask_token.repeat(x.shape[0], ids_restore.shape[1] + 1 - x.shape[1], 1)

            # strip the cls token and append the mask tokens -> [B, num_patches, D]
            x_ = torch.cat([x[:, 1:, :], mask_tokens], dim=1)
            # unshuffle: put every token back to its original position
            x_ = torch.gather(x_, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, x.shape[2]))

            # embed tokens: [B, num_patches, D_Encoder] -> [B, num_patches, D_Decoder]
            x_ = self.decoder_embed(x_)

            # interpret every token as a flattened patch and rebuild an image [B, C, H, W]
            x = self.unpatchify(x_)

            # apply the external decoder to the rebuilt image -> [B, CLS, H, W]
            x = self.decoder(x)
            # back to a token sequence [B, N, Dec]  (TODO: design a more meaningful mapping)
            x = self.patchify_decoder(x)

            # match the per-patch pixel count for the loss: [B, N, Dec] -> [B, N, p*p*C]
            x = self.decoder_pred(x)

        return x

    def forward_loss(self, imgs, pred, mask):
        """
        MSE loss between the reconstruction and the original image, averaged over the removed patches.
        Keeping the loss inside the model turns the model into a self-contained training framework.

        Input:
        imgs: [B, C, H, W], Encoder input image
        pred: [B, num_patches, p*p*C], Decoder reconstructed image
        mask: [B, num_patches], 0 is keep, 1 is remove

        Output:
        loss: scalar tensor
        """
        target = self.patchify(imgs)

        if self.norm_pix_loss:  # normalise every target patch
            mean = target.mean(dim=-1, keepdim=True)
            var = target.var(dim=-1, keepdim=True)
            target = (target - mean) / (var + 1.e-6) ** .5

        # MSE loss
        loss = (pred - target) ** 2
        loss = loss.mean(dim=-1)  # [N, L], mean loss per patch

        # mean loss on removed patches; the clamp avoids 0 / 0 = NaN when no patch is removed
        # (e.g. mask_ratio=0), in which case the loss is 0
        loss = (loss * mask).sum() / mask.sum().clamp(min=1)
        return loss

    def forward(self, imgs, mask_ratio=0.75):
        """
        :param imgs: [B, C, H, W], batch of images
        :param mask_ratio: fraction of patch tokens hidden from the encoder

        :return: loss (scalar), pred [B, num_patches, p*p*C], mask [B, num_patches]
        """
        # Encoder to obtain latent tokens
        latent, mask, ids_restore = self.forward_encoder(imgs, mask_ratio)
        # Decoder to obtain Reconstructed image patches
        pred = self.forward_decoder(latent, ids_restore)  # [N, L, p*p*C]
        # MSE loss on the removed patches towards the original image
        loss = self.forward_loss(imgs, pred, mask)
        return loss, pred, mask
def mae_vit_base_patch16_dec512d8b(dec_idx=None, **kwargs):
    """Build MAE ViT-Base/16 with the built-in Transformer decoder (512 dim, 8 blocks).

    :param dec_idx: only printed; the built-in decoder is always used
    :param kwargs: forwarded to MaskedAutoencoderViT
    """
    print("Decoder:", dec_idx)

    model = MaskedAutoencoderViT(
        patch_size=16, embed_dim=768, depth=12, num_heads=12,
        decoder_embed_dim=512, decoder_depth=8, decoder_num_heads=16,
        mlp_ratio=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
    return model


def mae_vit_large_patch16_dec512d8b(dec_idx=None, **kwargs):
    """Build MAE ViT-Large/16 with the built-in Transformer decoder (512 dim, 8 blocks).

    :param dec_idx: only printed; the built-in decoder is always used
    :param kwargs: forwarded to MaskedAutoencoderViT
    """
    print("Decoder:", dec_idx)

    model = MaskedAutoencoderViT(
        patch_size=16, embed_dim=1024, depth=24, num_heads=16,
        decoder_embed_dim=512, decoder_depth=8, decoder_num_heads=16,
        mlp_ratio=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
    return model


def mae_vit_huge_patch14_dec512d8b(dec_idx=None, **kwargs):
    """Build MAE ViT-Huge/14 with the built-in Transformer decoder (512 dim, 8 blocks).

    :param dec_idx: only printed; the built-in decoder is always used
    :param kwargs: forwarded to MaskedAutoencoderViT
    """
    print("Decoder:", dec_idx)

    model = MaskedAutoencoderViT(
        patch_size=14, embed_dim=1280, depth=32, num_heads=16,
        decoder_embed_dim=512, decoder_depth=8, decoder_num_heads=16,
        mlp_ratio=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
    return model


def _build_seg_decoder(dec_idx, num_classes, img_size, **kwargs):
    """Instantiate the segmentation network selected by dec_idx; return None for an unknown dec_idx."""
    if dec_idx == 'swin_unet':
        from SSL_structures.Swin_Unet_main.networks.vision_transformer import SwinUnet as ViT_seg
        # NOTE(review): the MAE kwargs are forwarded to SwinUnet unchanged (as before) and img_size is
        # not passed to it - confirm against the SwinUnet signature
        return ViT_seg(num_classes=num_classes, **kwargs)

    if dec_idx == 'transunet':
        transunet_name = 'R50-ViT-B_16'
        transunet_patches_size = 16
        from SSL_structures.TransUNet_main.networks.vit_seg_modeling import CONFIGS as CONFIGS_Transunet_seg
        from SSL_structures.TransUNet_main.networks.vit_seg_modeling import VisionTransformer as Transunet_seg

        config_vit = CONFIGS_Transunet_seg[transunet_name]
        config_vit.n_classes = num_classes
        config_vit.n_skip = 3

        if 'R50' in transunet_name:
            config_vit.patches.grid = (
                int(img_size / transunet_patches_size), int(img_size / transunet_patches_size))
        return Transunet_seg(config_vit, num_classes=config_vit.n_classes)

    if dec_idx == 'UTNetV2':
        from SSL_structures.UtnetV2.utnetv2 import UTNetV2 as UTNetV2_seg
        return UTNetV2_seg(in_chan=3, num_classes=num_classes)

    return None


def _build_mae_with_decoder(dec_idx, num_classes, img_size, patch_size, embed_dim, depth, num_heads, **kwargs):
    """Build an MAE whose decoder is the segmentation network named by dec_idx; return -1 if unknown."""
    decoder = _build_seg_decoder(dec_idx, num_classes, img_size, **kwargs)
    if decoder is None:
        # kept as a sentinel return (not an exception) so existing callers keep working
        print('no effective decoder!')
        return -1

    print('dec_idx: ', dec_idx)

    # The decoder consumes an RGB image rebuilt by unpatchify, so every decoder token has to hold exactly
    # one flattened patch: patch_size * patch_size * 3 (768 for /16, 588 for /14).
    decoder_embed_dim = patch_size ** 2 * 3
    # patchify_decoder flattens the decoder output per patch; this assumes the decoder returns
    # [B, num_classes, H, W] at input resolution (as annotated in forward_decoder) - TODO confirm for
    # num_classes != 3
    decoder_rep_dim = patch_size ** 2 * num_classes

    # img_size is forwarded so that the encoder is built for the same resolution as the decoder
    model = MaskedAutoencoderViT(
        img_size=img_size, patch_size=patch_size, embed_dim=embed_dim, depth=depth, num_heads=num_heads,
        decoder_embed_dim=decoder_embed_dim, decoder_depth=8, decoder_num_heads=16,
        mlp_ratio=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), decoder_rep_dim=decoder_rep_dim, decoder=decoder,
        **kwargs)
    return model


def mae_vit_base_patch16_decoder(dec_idx=None, num_classes=3, img_size=224, **kwargs):
    """Build MAE ViT-Base/16 with an external decoder ('swin_unet' / 'transunet' / 'UTNetV2').

    num_classes configures a one-hot segmentation output rather than a reconstruction; how to turn that
    into a proper reconstruction target for pre-training is still an open design question.

    :return: the model, or -1 if dec_idx names no supported decoder
    """
    return _build_mae_with_decoder(dec_idx, num_classes, img_size,
                                   patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs)


def mae_vit_large_patch16_decoder(dec_idx=None, num_classes=3, img_size=224, **kwargs):
    """Build MAE ViT-Large/16 with an external decoder ('swin_unet' / 'transunet' / 'UTNetV2').

    :return: the model, or -1 if dec_idx names no supported decoder
    """
    return _build_mae_with_decoder(dec_idx, num_classes, img_size,
                                   patch_size=16, embed_dim=1024, depth=24, num_heads=16, **kwargs)


def mae_vit_huge_patch14_decoder(dec_idx=None, num_classes=3, img_size=224, **kwargs):
    """Build MAE ViT-Huge/14 with an external decoder ('swin_unet' / 'transunet' / 'UTNetV2').

    All decoders use 14*14*3 = 588 dimensional tokens here; the previous 768 for 'transunet' and
    'UTNetV2' could not be unpatchified with patch size 14.

    :return: the model, or -1 if dec_idx names no supported decoder
    """
    return _build_mae_with_decoder(dec_idx, num_classes, img_size,
                                   patch_size=14, embed_dim=1280, depth=32, num_heads=16, **kwargs)


# set recommended archs
mae_vit_base_patch16 = mae_vit_base_patch16_dec512d8b  # decoder: 512 dim, 8 blocks
mae_vit_large_patch16 = mae_vit_large_patch16_dec512d8b  # decoder: 512 dim, 8 blocks
mae_vit_huge_patch14 = mae_vit_huge_patch14_dec512d8b  # decoder: 512 dim, 8 blocks

# The variants equipped with external decoders are exported under their own names:
# mae_vit_base_patch16_decoder, mae_vit_large_patch16_decoder, mae_vit_huge_patch14_decoder


if __name__ == '__main__':
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    img_size = 224
    num_classes = 3
    x = torch.rand(8, 3, img_size, img_size, device=device)

    # model = mae_vit_base_patch16(img_size=224, decoder=None)  # decoder_embed_dim=512
    model = mae_vit_base_patch16_decoder(prompt_mode='Deep', Prompt_Token_num=20, basic_state_dict=None,
                                         dec_idx='UTNetV2', img_size=img_size)

    model.to(device)

    loss, pred, mask_patch_indicators = model(x)

    print(loss, '\n')

    print(loss.shape, '\n')

    print(pred.shape, '\n')

    print(mask_patch_indicators.shape, '\n')
# --------------------------------------------------------
# 2D sine-cosine position embedding
# References:
# Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py
# MoCo v3: https://github.com/facebookresearch/moco-v3
# --------------------------------------------------------
def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
    """
    Build the fixed 2D sin-cos position embedding of a square patch grid.

    embed_dim: embedding dimension; it is split in two halves (one per axis) and each half in a sin and
               a cos part, so it has to be divisible by 4
    grid_size: int of the grid height and width, AKA the num of patch on each direction.
    cls_token: prepend an all-zero row for the cls token

    return:
    pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/o or w/ cls_token)
    """
    grid_h = np.arange(grid_size, dtype=np.float32)
    grid_w = np.arange(grid_size, dtype=np.float32)

    grid = np.meshgrid(grid_w, grid_h)  # here w goes first: grid[0] is the column index, grid[1] the row index
    grid = np.stack(grid, axis=0)  # (2, grid_size, grid_size)
    grid = grid.reshape([2, 1, grid_size, grid_size])

    # get a 2d positional encoding of (grid_size*grid_size, embed_dim)
    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)

    if cls_token:  # if the CLS token is here, give it a zero encoding
        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
    return pos_embed


def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    """
    Encode a stacked coordinate grid, one half of the embedding per coordinate plane.

    embed_dim: output dimension for each position (even)
    grid: array whose first axis holds the two coordinate planes
    out: (H*W, embed_dim)
    """
    assert embed_dim % 2 == 0

    # The first half encodes grid[0], the second half grid[1]. The names follow the reference code;
    # with the meshgrid call in get_2d_sincos_pos_embed, grid[0] is the column (w) index.
    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)

    emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
    return emb


def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    embed_dim: output dimension for each position (even)
    pos: a list of positions to be encoded: size (M,)
    out: (M, D), the sin part in the first D/2 columns and the cos part in the last D/2 columns
    """
    assert embed_dim % 2 == 0
    # The removed alias np.float meant float64. Using float32 here would compute the frequencies (and
    # the whole embedding) at reduced precision, so float64 is used explicitly.
    omega = np.arange(embed_dim // 2, dtype=np.float64)
    omega /= embed_dim / 2.
    omega = 1. / 10000 ** omega  # (D/2,)

    pos = pos.reshape(-1)  # (M,)
    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product

    emb_sin = np.sin(out)  # (M, D/2)
    emb_cos = np.cos(out)  # (M, D/2)

    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
    return emb


# --------------------------------------------------------
# Interpolate position embeddings for high-resolution
# References:
# DeiT: https://github.com/facebookresearch/deit
# --------------------------------------------------------
def interpolate_pos_embed(model, checkpoint_model):
    """
    Resize the position embedding of a checkpoint to the patch grid of ``model``.

    model: object providing ``patch_embed.num_patches`` and ``pos_embed``
    checkpoint_model: state_dict; its 'pos_embed' entry is replaced IN PLACE when the grid sizes
                      differ. Nothing happens if the key is missing or the sizes already match.
    return: None
    """
    if 'pos_embed' in checkpoint_model:
        pos_embed_checkpoint = checkpoint_model['pos_embed']
        embedding_size = pos_embed_checkpoint.shape[-1]
        num_patches = model.patch_embed.num_patches
        # tokens of the model that are not patches (cls / dist tokens)
        num_extra_tokens = model.pos_embed.shape[-2] - num_patches
        # height (== width) for the checkpoint position embedding
        orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
        # height (== width) for the new position embedding
        new_size = int(num_patches ** 0.5)
        # class_token and dist_token are kept unchanged
        if orig_size != new_size:
            print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size))
            extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
            # only the position tokens are interpolated
            pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
            # [1, N, D] -> [1, D, orig_size, orig_size] so that the 2D interpolation works on the grid
            pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
            pos_tokens = torch.nn.functional.interpolate(
                pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
            # back to [1, new_size*new_size, D]
            pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
            new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
            checkpoint_model['pos_embed'] = new_pos_embed
0000000000000000000000000000000000000000..058c6edd177162c02e2cf7e802a0c9191b5897b4 --- /dev/null +++ b/PuzzleTuning/SSL_structures/temp-tensors/color.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63b58afc3d30f2cead67e4e9d77d0509a9cc547cc0a26390e07204e8f8f9ff0c +size 2409195 diff --git a/PuzzleTuning/SSL_structures/temp-tensors/color_labels.pt b/PuzzleTuning/SSL_structures/temp-tensors/color_labels.pt new file mode 100644 index 0000000000000000000000000000000000000000..135a55b90ab671cb650ff50862582cb120d6e69f --- /dev/null +++ b/PuzzleTuning/SSL_structures/temp-tensors/color_labels.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25f4d6316bbcab066eaf26be4e418547a468d3f5d45c274535306bd37ca814b6 +size 747 diff --git a/PuzzleTuning/SSL_structures/temp-tensors/warwick.pt b/PuzzleTuning/SSL_structures/temp-tensors/warwick.pt new file mode 100644 index 0000000000000000000000000000000000000000..e37fa21a753b5c76591757775ad2e267651af424 --- /dev/null +++ b/PuzzleTuning/SSL_structures/temp-tensors/warwick.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ec6955b6afe4692ee4ead145f50e052e5c4481409beeda42bc4377d6fe8f579 +size 2409195 diff --git a/PuzzleTuning/SSL_structures/temp-tensors/warwick_labels.pt b/PuzzleTuning/SSL_structures/temp-tensors/warwick_labels.pt new file mode 100644 index 0000000000000000000000000000000000000000..edca0ee35b3cf03395e6319763430844f128bbb0 --- /dev/null +++ b/PuzzleTuning/SSL_structures/temp-tensors/warwick_labels.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65c8a01f50cf37cccc74514ada3f2e4d696133652404b2bb4313cfd0fe6df063 +size 747 diff --git a/PuzzleTuning/Test.py b/PuzzleTuning/Test.py new file mode 100644 index 0000000000000000000000000000000000000000..a854908244a66a585ad3e10301b18b58fe07c76b --- /dev/null +++ b/PuzzleTuning/Test.py @@ -0,0 +1,478 @@ +""" +Testing Script ver: Oct 23rd 17:30 +""" + +from __future__ import 
print_function, division + +import argparse +import json +import time + +import torchvision +from tensorboardX import SummaryWriter + +from Backbone.getmodel import get_model +from Backbone.GetPromptModel import build_promptmodel + +from utils.data_augmentation import * +from utils.visual_usage import * + + +def test_model(model, test_dataloader, criterion, class_names, test_dataset_size, model_idx, test_model_idx, edge_size, + check_minibatch=100, device=None, draw_path='../imaging_results', enable_attention_check=None, + enable_visualize_check=True, writer=None): + """ + Testing iteration + + :param model: model object + :param test_dataloader: the test_dataloader obj + :param criterion: loss func obj + :param class_names: The name of classes for priting + :param test_dataset_size: size of datasets + + :param model_idx: model idx for the getting trained model + :param edge_size: image size for the input image + :param check_minibatch: number of skip over minibatch in calculating the criteria's results etc. + + :param device: cpu/gpu object + :param draw_path: path folder for output pic + :param enable_attention_check: use attention_check to show the pics of models' attention areas + :param enable_visualize_check: use visualize_check to show the pics + + :param writer: attach the records to the tensorboard backend + """ + + # scheduler is an LR scheduler object from torch.optim.lr_scheduler. 
+ if device is None: + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + + since = time.time() + + print('Epoch: Test') + print('-' * 10) + + phase = 'test' + index = 0 + model_time = time.time() + + # initiate the empty json dict + json_log = {'test': {}} + + # initiate the empty log dict + log_dict = {} + for cls_idx in range(len(class_names)): + log_dict[class_names[cls_idx]] = {'tp': 0.0, 'tn': 0.0, 'fp': 0.0, 'fn': 0.0} + + model.eval() # Set model to evaluate mode + + # criterias, initially empty + running_loss = 0.0 + log_running_loss = 0.0 + running_corrects = 0 + + # Iterate over data. + for inputs, labels in test_dataloader: # use different dataloder in different phase + inputs = inputs.to(device) + # print('inputs[0]',type(inputs[0])) + + labels = labels.to(device) + + # zero the parameter gradients only need in training + # optimizer.zero_grad() + + # forward + outputs = model(inputs) + _, preds = torch.max(outputs, 1) + loss = criterion(outputs, labels) + + # log criterias: update + log_running_loss += loss.item() + running_loss += loss.item() * inputs.size(0) + running_corrects += torch.sum(preds == labels.data) + + # Compute recision and recall for each class. 
+ for cls_idx in range(len(class_names)): + # NOTICE remember to put tensor back to cpu + tp = np.dot((labels.cpu().data == cls_idx).numpy().astype(int), + (preds == cls_idx).cpu().numpy().astype(int)) + tn = np.dot((labels.cpu().data != cls_idx).numpy().astype(int), + (preds != cls_idx).cpu().numpy().astype(int)) + + fp = np.sum((preds == cls_idx).cpu().numpy()) - tp + + fn = np.sum((labels.cpu().data == cls_idx).numpy()) - tp + + # log_dict[cls_idx] = {'tp': 0, 'tn': 0, 'fp': 0, 'fn': 0} + log_dict[class_names[cls_idx]]['tp'] += tp + log_dict[class_names[cls_idx]]['tn'] += tn + log_dict[class_names[cls_idx]]['fp'] += fp + log_dict[class_names[cls_idx]]['fn'] += fn + + # attach the records to the tensorboard backend + if writer is not None: + # ...log the running loss + writer.add_scalar(phase + ' minibatch loss', + float(loss.item()), + index) + writer.add_scalar(phase + ' minibatch ACC', + float(torch.sum(preds == labels.data) / inputs.size(0)), + index) + + # at the checking time now + if index % check_minibatch == check_minibatch - 1: + model_time = time.time() - model_time + + check_index = index // check_minibatch + 1 + + epoch_idx = 'test' + print('Epoch:', epoch_idx, ' ', phase, 'index of ' + str(check_minibatch) + ' minibatch:', + check_index, ' time used:', model_time) + + print('minibatch AVG loss:', float(log_running_loss) / check_minibatch) + + # how many image u want to check, should SMALLER THAN the batchsize + + if enable_attention_check: + try: + check_SAA(inputs, labels, model, model_idx, edge_size, class_names, num_images=1, + pic_name='GradCAM_' + str(epoch_idx) + '_I_' + str(index + 1), + draw_path=draw_path, writer=writer) + except: + print('model:', model_idx, ' with edge_size', edge_size, 'is not supported yet') + else: + pass + + if enable_visualize_check: + visualize_check(inputs, labels, model, class_names, num_images=-1, + pic_name='Visual_' + str(epoch_idx) + '_I_' + str(index + 1), + draw_path=draw_path, writer=writer) + + model_time 
= time.time() + log_running_loss = 0.0 + + index += 1 + # json log: update + json_log['test'][phase] = log_dict + + # log criterias: print + epoch_loss = running_loss / test_dataset_size + epoch_acc = running_corrects.double() / test_dataset_size * 100 + print('\nEpoch: {} \nLoss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc)) + + for cls_idx in range(len(class_names)): + # calculating the confusion matrix + tp = log_dict[class_names[cls_idx]]['tp'] + tn = log_dict[class_names[cls_idx]]['tn'] + fp = log_dict[class_names[cls_idx]]['fp'] + fn = log_dict[class_names[cls_idx]]['fn'] + tp_plus_fp = tp + fp + tp_plus_fn = tp + fn + fp_plus_tn = fp + tn + fn_plus_tn = fn + tn + + # precision + if tp_plus_fp == 0: + precision = 0 + else: + precision = float(tp) / tp_plus_fp * 100 + # recall + if tp_plus_fn == 0: + recall = 0 + else: + recall = float(tp) / tp_plus_fn * 100 + + # TPR (sensitivity) + TPR = recall + + # TNR (specificity) + # FPR + if fp_plus_tn == 0: + TNR = 0 + FPR = 0 + else: + TNR = tn / fp_plus_tn * 100 + FPR = fp / fp_plus_tn * 100 + + # NPV + if fn_plus_tn == 0: + NPV = 0 + else: + NPV = tn / fn_plus_tn * 100 + + print('{} precision: {:.4f} recall: {:.4f}'.format(class_names[cls_idx], precision, recall)) + print('{} sensitivity: {:.4f} specificity: {:.4f}'.format(class_names[cls_idx], TPR, TNR)) + print('{} FPR: {:.4f} NPV: {:.4f}'.format(class_names[cls_idx], FPR, NPV)) + print('{} TP: {}'.format(class_names[cls_idx], tp)) + print('{} TN: {}'.format(class_names[cls_idx], tn)) + print('{} FP: {}'.format(class_names[cls_idx], fp)) + print('{} FN: {}'.format(class_names[cls_idx], fn)) + + print('\n') + + time_elapsed = time.time() - since + print('Testing complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60)) + + # attach the records to the tensorboard backend + if writer is not None: + writer.close() + + # save json_log indent=2 for better view + json.dump(json_log, open(os.path.join(draw_path, test_model_idx + 
def main(args):
    """
    Test a trained classification model on the 'test' split of an ImageFolder dataset.

    Steps: read the settings from args, build the dataset/dataloader, build the model
    (plain backbone or VPT prompt model), load the trained weights, choose the device and
    run test_model, which writes its json log (and optional figures) under draw_path.

    :param args: parsed command-line namespace, see get_args_parser
    :return: -1 when the class number mismatches the dataset or the weights cannot be loaded,
             otherwise None
    """
    if args.paint:
        # use Agg kernel, not painting in the front-desk
        import matplotlib
        matplotlib.use('Agg')

    gpu_idx = args.gpu_idx  # GPU idx start with 0, -1 to use multiple GPU

    enable_tensorboard = args.enable_tensorboard  # False

    enable_attention_check = args.enable_attention_check  # False
    enable_visualize_check = args.enable_visualize_check  # False

    data_augmentation_mode = args.data_augmentation_mode  # 0

    # Prompt
    PromptTuning = args.PromptTuning  # None  "Deep" / "Shallow"
    Prompt_Token_num = args.Prompt_Token_num  # 20
    PromptUnFreeze = args.PromptUnFreeze  # False

    model_idx = args.model_idx  # the model we are going to use. by the format of Model_size_other_info

    # structural parameter
    drop_rate = args.drop_rate
    attn_drop_rate = args.attn_drop_rate
    drop_path_rate = args.drop_path_rate
    use_cls_token = False if args.cls_token_off else True
    use_pos_embedding = False if args.pos_embedding_off else True
    use_att_module = None if args.att_module == 'None' else args.att_module

    # PATH info
    draw_root = args.draw_root
    model_path = args.model_path
    dataroot = args.dataroot
    model_path_by_hand = args.model_path_by_hand  # None
    # Pre_Trained model basic for prompt turned model's test
    Pre_Trained_model_path = args.Pre_Trained_model_path  # None

    # CLS_ is for the CLS trained models, MIL_Stripe will be MIL trained and use Stripe to test
    test_model_idx = 'CLS_' + model_idx + '_test'
    # NOTICE: MIL model should only be tested in stripe model in this test.py

    draw_path = os.path.join(draw_root, test_model_idx)

    # load Finetuning trained model by its task-based saving name,
    # also support MIL-SI model but the MIL_Stripe is required
    if model_path_by_hand is None:
        # CLS_ is for the CLS training, MIL will be MIL training
        save_model_path = os.path.join(model_path, 'CLS_' + model_idx + '.pth')
    else:
        save_model_path = model_path_by_hand

    os.makedirs(draw_path, exist_ok=True)

    # choose the test dataset
    test_dataroot = os.path.join(dataroot, 'test')

    # dataset info
    num_classes = args.num_classes  # default 0 for auto-fit
    edge_size = args.edge_size

    # validating setting
    batch_size = args.batch_size
    criterion = nn.CrossEntropyLoss()

    # Data Augmentation is not used in validating or testing
    data_transforms = data_augmentation(data_augmentation_mode, edge_size=edge_size)

    # test setting is the same as the validate dataset's setting
    test_datasets = torchvision.datasets.ImageFolder(test_dataroot, data_transforms['val'])
    test_dataset_size = len(test_datasets)

    # by default skip minibatches so that about 20 figures are drawn
    if args.check_minibatch is not None:
        check_minibatch = args.check_minibatch
    else:
        check_minibatch = test_dataset_size // (20 * batch_size)
    # test_model evaluates `index % check_minibatch`; a small dataset would make the default 0
    # and crash with ZeroDivisionError, so the interval is kept at 1 or more
    check_minibatch = max(1, check_minibatch)

    test_dataloader = torch.utils.data.DataLoader(test_datasets, batch_size=batch_size, shuffle=False, num_workers=1)

    class_names = sorted(d.name for d in os.scandir(test_dataroot) if d.is_dir())

    if num_classes == 0:
        print("class_names:", class_names)
        num_classes = len(class_names)
    elif len(class_names) == num_classes:
        print("class_names:", class_names)
    else:
        print('classfication number of the model mismatch the dataset requirement of:', len(class_names))
        return -1

    # get model
    pretrained_backbone = False  # model is trained already, pretrained backbone weight is useless here

    if PromptTuning is None:
        model = get_model(num_classes, edge_size, model_idx, drop_rate, attn_drop_rate, drop_path_rate,
                          pretrained_backbone, use_cls_token, use_pos_embedding, use_att_module)
    else:
        if Pre_Trained_model_path is not None and os.path.exists(Pre_Trained_model_path):
            base_state_dict = torch.load(Pre_Trained_model_path)
        else:
            base_state_dict = 'timm'
            print('base_state_dict of timm')

        print('Test the PromptTuning of ', model_idx)
        print('Prompt VPT type:', PromptTuning)
        model = build_promptmodel(num_classes, edge_size, model_idx, Prompt_Token_num=Prompt_Token_num,
                                  VPT_type=PromptTuning, base_state_dict=base_state_dict)

    def load_trained_weights(target):
        # the checkpoint is mapped to CPU so a GPU-saved file also loads on a CPU-only host;
        # the model is moved to its final device further below
        state = torch.load(save_model_path, map_location='cpu')
        if PromptTuning is None or PromptUnFreeze:
            target.load_state_dict(state)
        else:
            target.load_prompt(state)

    try:
        load_trained_weights(model)
        print("model loaded")
        print("model :", model_idx)
    except Exception as single_err:
        # second attempt: wrap with DataParallel and try the same checkpoint again
        try:
            model = nn.DataParallel(model)
            load_trained_weights(model)
            print("DataParallel model loaded")
        except Exception as parallel_err:
            # report both reasons instead of hiding them behind a bare except
            print("model loading erro!!", single_err, '|', parallel_err)
            return -1

    # os.environ assignment with a str value cannot fail, so no fallback branches are needed
    if gpu_idx == -1:
        if torch.cuda.device_count() > 1:
            print("Use", torch.cuda.device_count(), "GPUs!")
            # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
            if not isinstance(model, nn.DataParallel):  # the loading fallback may have wrapped it already
                model = nn.DataParallel(model)
        else:
            print('we dont have more GPU idx here, try to use gpu_idx=0')
            # setting 0 for: only card idx 0 is sighted for this code
            os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    else:
        # setting k for: only card idx k is sighted for this code
        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_idx)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # single card for test

    model.to(device)

    # start tensorboard backend
    writer = SummaryWriter(draw_path) if enable_tensorboard else None

    # if you want to run tensorboard locally
    # nohup tensorboard --logdir=/home/experiments/runs --host=0.0.0.0 --port=7777 &

    print("*********************************{}*************************************".format('setting'))
    print(args)

    test_model(model, test_dataloader, criterion, class_names, test_dataset_size, model_idx=model_idx,
               test_model_idx=test_model_idx, edge_size=edge_size, check_minibatch=check_minibatch,
               device=device, draw_path=draw_path, enable_attention_check=enable_attention_check,
               enable_visualize_check=enable_visualize_check, writer=writer)


def get_args_parser():
    """
    Build the command-line parser of the testing script.

    :return: argparse.ArgumentParser holding the model, path, check-tool, prompt and dataset options
    """
    parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')

    # Model Name or index
    parser.add_argument('--model_idx', default='ViT_base', type=str, help='Model Name or index')

    # drop_rate, attn_drop_rate, drop_path_rate
    parser.add_argument('--drop_rate', default=0.0, type=float, help='dropout rate , default 0.0')
    parser.add_argument('--attn_drop_rate', default=0.0, type=float, help='dropout rate Aftter Attention, default 0.0')
    parser.add_argument('--drop_path_rate', default=0.0, type=float, help='drop path for stochastic depth, default 0.0')

    # Ablation Studies for MSHT
    parser.add_argument('--cls_token_off', action='store_true', help='use cls_token in model structure')
    parser.add_argument('--pos_embedding_off', action='store_true', help='use pos_embedding in model structure')
    # 'SimAM', 'CBAM', 'SE' 'None'
    parser.add_argument('--att_module', default='SimAM', type=str, help='use which att_module in model structure')

    # Environment parameters
    parser.add_argument('--gpu_idx', default=0, type=int,
                        help='use a single GPU with its index, -1 to use multiple GPU')

    # Path parameters
    parser.add_argument('--dataroot', default=r'/data/k5_dataset',
                        help='path to dataset')
    parser.add_argument('--model_path', default=r'/home/saved_models',
                        help='root path to save model state-dict, model will be find by name')
    parser.add_argument('--draw_root', default=r'/home/runs',
                        help='path to draw and save tensorboard output')
    # model_path_by_hand
    parser.add_argument('--model_path_by_hand', default=None, type=str, help='specified path to a model state-dict')

    # Help tool parameters
    # NOTE: store_false, so args.paint is True (Agg backend) unless --paint is given
    parser.add_argument('--paint', action='store_false', help='paint in front desk')  # matplotlib.use('Agg')

    # check tool parameters
    parser.add_argument('--enable_tensorboard', action='store_true', help='enable tensorboard to save status')

    parser.add_argument('--enable_attention_check', action='store_true', help='check and save attention map')
    parser.add_argument('--enable_visualize_check', action='store_true', help='check and save pics')

    parser.add_argument('--data_augmentation_mode', default=0, type=int, help='data_augmentation_mode')

    # PromptTuning
    parser.add_argument('--PromptTuning', default=None, type=str,
                        help='use Prompt Tuning strategy instead of Finetuning')
    # Prompt_Token_num
    parser.add_argument('--Prompt_Token_num', default=20, type=int, help='Prompt_Token_num')
    # PromptUnFreeze
    parser.add_argument('--PromptUnFreeze', action='store_true', help='prompt tuning with all parameaters un-freezed')
    # prompt model basic model path
    parser.add_argument('--Pre_Trained_model_path', default=None, type=str,
                        help='Finetuning a trained model in this dataset')

    # Dataset based parameters
    parser.add_argument('--num_classes', default=0, type=int, help='classification number, default 0 for auto-fit')
    parser.add_argument('--edge_size', default=384, type=int, help='edge size of input image')  # 224 256 384 1000

    # Test setting parameters
    parser.add_argument('--batch_size', default=1, type=int, help='testing batch_size default 1')
    # check_minibatch for painting pics
    parser.add_argument('--check_minibatch', default=None, type=int, help='check batch_size')

    return parser


if __name__ == '__main__':
    parser = get_args_parser()
    args = parser.parse_args()
    main(args)
# Training Strategy
def better_performance(temp_acc, temp_vac, best_acc, best_vac):
    """
    Decide whether the current epoch beats the best epoch recorded so far.

    :param temp_acc: train accuracy of the current epoch
    :param temp_vac: validation accuracy of the current epoch
    :param best_acc: train accuracy of the best epoch so far
    :param best_vac: validation accuracy of the best epoch so far
    :return: True when neither accuracy got worse, or when the validation accuracy
             is strictly higher; False otherwise
    """
    # case 1: validation AND train accuracy are both at least as good as the record
    no_worse_on_both = temp_vac >= best_vac and temp_acc >= best_acc
    if no_worse_on_both:
        return True

    # case 2: a strictly better validation accuracy wins whatever the train accuracy is
    return bool(temp_vac > best_vac)
name of classes for priting + :param dataset_sizes: size of datasets + :param Augmentation: Online augmentation methods + :param fix_position_ratio_scheduler: Online augmentation fix_position_ratio_scheduler + :param puzzle_patch_size_scheduler: Online augmentation puzzle_patch_size_scheduler + :param edge_size: image size for the input image + :param model_idx: model idx for the getting pre-setted model + :param num_epochs: total training epochs + :param intake_epochs: number of skip over epochs when choosing the best model + :param check_minibatch: number of skip over minibatch in calculating the criteria's results etc. + :param scheduler: scheduler is an LR scheduler object from torch.optim.lr_scheduler. + :param device: cpu/gpu object + :param draw_path: path folder for output pic + :param enable_attention_check: use attention_check to show the pics of models' attention areas + :param enable_visualize_check: use visualize_check to show the pics + :param enable_sam: use SAM training strategy + :param writer: attach the records to the tensorboard backend + """ + + if device is None: + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + + since = time.time() + + # for saving the best model state dict + best_model_wts = copy.deepcopy(model.state_dict()) # deepcopy + # initial an empty dict + json_log = {} + + # initial best performance + best_acc = 0.0 + best_vac = 0.0 + temp_acc = 0.0 + temp_vac = 0.0 + best_epoch_idx = 1 + + epoch_loss = 0.0 # initial value for loss-drive + + for epoch in range(num_epochs): + print('Epoch {}/{}'.format(epoch + 1, num_epochs)) + print('-' * 10) + + # record json log, initially empty + json_log[str(epoch + 1)] = {} + + # Each epoch has a training and validation phase + for phase in ['train', 'val']: # alternatively train/val + + index = 0 + check_index = -1 # set a visulize check at the end of each epoch's train and val + model_time = time.time() + + # initiate the empty log dict + log_dict = {} + for cls_idx 
in range(len(class_names)): + # only float type is allowed in json, set to float inside + log_dict[class_names[cls_idx]] = {'tp': 0.0, 'tn': 0.0, 'fp': 0.0, 'fn': 0.0} + + if phase == 'train': + model.train() # Set model to training mode + else: + model.eval() # Set model to evaluate mode + + # criterias, initially empty + running_loss = 0.0 + log_running_loss = 0.0 + running_corrects = 0 + + # Iterate over data. + for inputs, labels in dataloaders[phase]: # use different dataloder in different phase + + inputs = inputs.to(device) # print('inputs[0]',type(inputs[0])) + # NOTICE in CLS task the labels' type is long tensor([B]),not one-hot ([B,CLS]) + labels = labels.to(device) + + # Online Augmentations on device + if Augmentation is not None: + if phase == 'train': + # cellmix + if fix_position_ratio_scheduler is not None and puzzle_patch_size_scheduler is not None: + # loss-drive + fix_position_ratio = fix_position_ratio_scheduler(epoch, epoch_loss) + puzzle_patch_size = puzzle_patch_size_scheduler(epoch, epoch_loss) + + inputs, labels, GT_long_labels = Augmentation(inputs, labels, + fix_position_ratio, puzzle_patch_size) + # Counterpart augmentations + else: + inputs, labels, GT_long_labels = Augmentation(inputs, labels) + + else: # Val + inputs, labels, GT_long_labels = Augmentation(inputs, labels, act=False) + else: + GT_long_labels = labels # store ori_label on CPU + + # zero the parameter gradients + if not enable_sam: + optimizer.zero_grad() + + # forward + # track grad if only in train! 
+ with torch.set_grad_enabled(phase == 'train'): + + outputs = model(inputs) # pred outputs of confidence: [B,CLS] + _, preds = torch.max(outputs, 1) # idx outputs: [B] each is a idx + loss = criterion(outputs, labels) # cross entrphy of one-hot outputs: [B,CLS] and idx label [B] + + # backward + optimize only if in training phase + if phase == 'train': + if enable_sam: + loss.backward() + # first forward-backward pass + optimizer.first_step(zero_grad=True) + + # second forward-backward pass + loss2 = criterion(model(inputs), labels) # SAM need another model(inputs) + loss2.backward() # make sure to do a full forward pass when using SAM + optimizer.second_step(zero_grad=True) + else: + loss.backward() + optimizer.step() + + # log criterias: update + log_running_loss += loss.item() + running_loss += loss.item() * inputs.size(0) + running_corrects += torch.sum(preds.cpu() == GT_long_labels.cpu().data) + + # Compute precision and recall for each class. + for cls_idx in range(len(class_names)): + tp = np.dot((GT_long_labels.cpu().data == cls_idx).numpy().astype(int), + (preds == cls_idx).cpu().numpy().astype(int)) + tn = np.dot((GT_long_labels.cpu().data != cls_idx).numpy().astype(int), + (preds != cls_idx).cpu().numpy().astype(int)) + + fp = np.sum((preds == cls_idx).cpu().numpy()) - tp + + fn = np.sum((GT_long_labels.cpu().data == cls_idx).numpy()) - tp + + # log_dict[cls_idx] = {'tp': 0.0, 'tn': 0.0, 'fp': 0.0, 'fn': 0.0} set to float inside + log_dict[class_names[cls_idx]]['tp'] += tp + log_dict[class_names[cls_idx]]['tn'] += tn + log_dict[class_names[cls_idx]]['fp'] += fp + log_dict[class_names[cls_idx]]['fn'] += fn + + # attach the records to the tensorboard backend + if writer is not None: + # ...log the running loss + writer.add_scalar(phase + ' minibatch loss', + float(loss.item()), + epoch * len(dataloaders[phase]) + index) + writer.add_scalar(phase + ' minibatch ACC', + float(torch.sum(preds.cpu() == GT_long_labels.cpu().data) / inputs.size(0)), + epoch * 
len(dataloaders[phase]) + index) + + # at the checking time now + if index % check_minibatch == check_minibatch - 1: + model_time = time.time() - model_time + + check_index = index // check_minibatch + 1 + + epoch_idx = epoch + 1 + print('Epoch:', epoch_idx, ' ', phase, 'index of ' + str(check_minibatch) + ' minibatch:', + check_index, ' time used:', model_time) + + print('minibatch AVG loss:', float(log_running_loss) / check_minibatch) + + if enable_visualize_check: + visualize_check(inputs, GT_long_labels, model, class_names, num_images=-1, + pic_name='Visual_' + phase + '_E_' + str(epoch_idx) + '_I_' + str(index + 1), + draw_path=draw_path, writer=writer) + + if enable_attention_check: + try: + check_SAA(inputs, GT_long_labels, model, model_idx, edge_size, class_names, num_images=1, + pic_name='GradCAM_' + phase + '_E_' + str(epoch_idx) + '_I_' + str(index + 1), + draw_path=draw_path, writer=writer) + except: + print('model:', model_idx, ' with edge_size', edge_size, 'is not supported yet') + else: + pass + + model_time = time.time() + log_running_loss = 0.0 + + index += 1 + + if phase == 'train': + if scheduler is not None: # lr scheduler: update + scheduler.step() + + # at the last of train/val in each epoch, if no check has been triggered + if check_index == -1: + epoch_idx = epoch + 1 + if enable_visualize_check: + visualize_check(inputs, GT_long_labels, model, class_names, num_images=-1, + pic_name='Visual_' + phase + '_E_' + str(epoch_idx), + draw_path=draw_path, writer=writer) + + if enable_attention_check: + try: + check_SAA(inputs, GT_long_labels, model, model_idx, edge_size, class_names, num_images=1, + pic_name='GradCAM_' + phase + '_E_' + str(epoch_idx), + draw_path=draw_path, writer=writer) + except: + print('model:', model_idx, ' with edge_size', edge_size, 'is not supported yet') + else: + pass + + # log criterias: print + epoch_loss = running_loss / dataset_sizes[phase] + epoch_acc = running_corrects.double() / dataset_sizes[phase] * 100 + 
print('\nEpoch: {} {} \nLoss: {:.4f} Acc: {:.4f}'.format(epoch + 1, phase, epoch_loss, epoch_acc)) + + if phase == 'train' and fix_position_ratio_scheduler is not None \ + and puzzle_patch_size_scheduler is not None: + print('\nEpoch: {}, Fix_position_ratio: {}, Puzzle_patch_size: ' + '{}'.format(epoch + 1, fix_position_ratio, puzzle_patch_size)) + + # attach the records to the tensorboard backend + if writer is not None: + # ...log the running loss + writer.add_scalar(phase + ' loss', + float(epoch_loss), + epoch + 1) + writer.add_scalar(phase + ' ACC', + float(epoch_acc), + epoch + 1) + + # calculating the confusion matrix + for cls_idx in range(len(class_names)): + tp = log_dict[class_names[cls_idx]]['tp'] + tn = log_dict[class_names[cls_idx]]['tn'] + fp = log_dict[class_names[cls_idx]]['fp'] + fn = log_dict[class_names[cls_idx]]['fn'] + tp_plus_fp = tp + fp + tp_plus_fn = tp + fn + fp_plus_tn = fp + tn + fn_plus_tn = fn + tn + + # precision + if tp_plus_fp == 0: + precision = 0 + else: + precision = float(tp) / tp_plus_fp * 100 + # recall + if tp_plus_fn == 0: + recall = 0 + else: + recall = float(tp) / tp_plus_fn * 100 + + # TPR (sensitivity) + TPR = recall + + # TNR (specificity) + # FPR + if fp_plus_tn == 0: + TNR = 0 + FPR = 0 + else: + TNR = tn / fp_plus_tn * 100 + FPR = fp / fp_plus_tn * 100 + + # NPV + if fn_plus_tn == 0: + NPV = 0 + else: + NPV = tn / fn_plus_tn * 100 + + print('{} precision: {:.4f} recall: {:.4f}'.format(class_names[cls_idx], precision, recall)) + print('{} sensitivity: {:.4f} specificity: {:.4f}'.format(class_names[cls_idx], TPR, TNR)) + print('{} FPR: {:.4f} NPV: {:.4f}'.format(class_names[cls_idx], FPR, NPV)) + print('{} TP: {}'.format(class_names[cls_idx], tp)) + print('{} TN: {}'.format(class_names[cls_idx], tn)) + print('{} FP: {}'.format(class_names[cls_idx], fp)) + print('{} FN: {}'.format(class_names[cls_idx], fn)) + # attach the records to the tensorboard backend + if writer is not None: + # ...log the running loss + 
writer.add_scalar(phase + ' ' + class_names[cls_idx] + ' precision', + precision, + epoch + 1) + writer.add_scalar(phase + ' ' + class_names[cls_idx] + ' recall', + recall, + epoch + 1) + + # json log: update + json_log[str(epoch + 1)][phase] = log_dict + + if phase == 'val': + temp_vac = epoch_acc + else: + temp_acc = epoch_acc # not useful actually + + # deep copy the model + if phase == 'val' and better_performance(temp_acc, temp_vac, best_acc, best_vac) and epoch >= intake_epochs: + # what is better? we now use the wildly used method only + best_epoch_idx = epoch + 1 + best_acc = temp_acc + best_vac = temp_vac + best_model_wts = copy.deepcopy(model.state_dict()) + best_log_dic = log_dict + + print('\n') + + print() + + time_elapsed = time.time() - since + print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60)) + print('Best epoch idx: ', best_epoch_idx) + print('Best epoch train Acc: {:4f}'.format(best_acc)) + print('Best epoch val Acc: {:4f}'.format(best_vac)) + for cls_idx in range(len(class_names)): + tp = best_log_dic[class_names[cls_idx]]['tp'] + tn = best_log_dic[class_names[cls_idx]]['tn'] + fp = best_log_dic[class_names[cls_idx]]['fp'] + fn = best_log_dic[class_names[cls_idx]]['fn'] + tp_plus_fp = tp + fp + tp_plus_fn = tp + fn + fp_plus_tn = fp + tn + fn_plus_tn = fn + tn + + # precision + if tp_plus_fp == 0: + precision = 0 + else: + precision = float(tp) / tp_plus_fp * 100 + # recall + if tp_plus_fn == 0: + recall = 0 + else: + recall = float(tp) / tp_plus_fn * 100 + + # TPR (sensitivity) + TPR = recall + + # TNR (specificity) + # FPR + if fp_plus_tn == 0: + TNR = 0 + FPR = 0 + else: + TNR = tn / fp_plus_tn * 100 + FPR = fp / fp_plus_tn * 100 + + # NPV + if fn_plus_tn == 0: + NPV = 0 + else: + NPV = tn / fn_plus_tn * 100 + + print('{} precision: {:.4f} recall: {:.4f}'.format(class_names[cls_idx], precision, recall)) + print('{} sensitivity: {:.4f} specificity: {:.4f}'.format(class_names[cls_idx], TPR, TNR)) + 
def main(args):
    """
    Train a classification model on an ImageFolder dataset with 'train' and 'val' splits.

    Steps: read the settings from args, prepare the output folders, datasets and dataloaders,
    build the model (VPT prompt model or plain backbone), optionally restrict training to the
    last child module (linear probing), build the online augmentation, loss, optimizer and
    lr scheduler, run train_model and save the resulting weights to
    <model_path>/CLS_<model_idx>.pth.

    :param args: parsed command-line namespace, see get_args_parser
    :return: -1 when the class number mismatches the dataset, otherwise None
    :raises FileNotFoundError: a given Pre_Trained_model_path / Prompt_state_path does not exist
    :raises ValueError: opt_name is neither 'SGD' nor 'Adam'
    """
    if args.paint:
        # use Agg kernel, not painting in the front-desk
        import matplotlib
        matplotlib.use('Agg')

    enable_tensorboard = args.enable_tensorboard  # True
    enable_attention_check = args.enable_attention_check  # False   'CAM' 'SAA'
    enable_visualize_check = args.enable_visualize_check  # False

    enable_sam = args.enable_sam  # False

    data_augmentation_mode = args.data_augmentation_mode  # 0

    linearprobing = args.linearprobing  # False

    Pre_Trained_model_path = args.Pre_Trained_model_path  # None
    Prompt_state_path = args.Prompt_state_path  # None

    # Prompt
    PromptTuning = args.PromptTuning  # None  "Deep" / "Shallow"
    Prompt_Token_num = args.Prompt_Token_num  # 20
    PromptUnFreeze = args.PromptUnFreeze  # False

    gpu_idx = args.gpu_idx  # GPU idx start with 0, -1 to use multiple GPU

    # model info
    model_idx = args.model_idx  # the model we are going to use. by the format of Model_size_other_info
    # structural parameter
    drop_rate = args.drop_rate
    attn_drop_rate = args.attn_drop_rate
    drop_path_rate = args.drop_path_rate
    use_cls_token = False if args.cls_token_off else True
    use_pos_embedding = False if args.pos_embedding_off else True
    use_att_module = None if args.att_module == 'None' else args.att_module

    # pretrained_backbone
    pretrained_backbone = False if args.backbone_PT_off else True

    # classification required number of your dataset
    num_classes = args.num_classes  # default 0 for auto-fit
    # image size for the input image
    edge_size = args.edge_size  # 224 384 1000

    # batch info
    batch_size = args.batch_size  # 8
    num_workers = args.num_workers  # main training num_workers 4

    num_epochs = args.num_epochs  # 50
    intake_epochs = args.intake_epochs  # 0
    check_minibatch = args.check_minibatch if args.check_minibatch is not None else 400 // batch_size
    # train_model evaluates `index % check_minibatch`; batch_size > 400 would make the default 0
    # and crash with ZeroDivisionError, so the interval is kept at 1 or more
    check_minibatch = max(1, check_minibatch)

    lr = args.lr  # 0.000007
    lrf = args.lrf  # 0.0

    opt_name = args.opt_name  # 'Adam'

    # PATH info
    draw_root = args.draw_root
    model_path = args.model_path
    dataroot = args.dataroot

    draw_path = os.path.join(draw_root, 'CLS_' + model_idx)  # CLS_ is for the CLS training, MIL will be MIL training
    save_model_path = os.path.join(model_path, 'CLS_' + model_idx + '.pth')

    os.makedirs(model_path, exist_ok=True)

    if os.path.exists(draw_path):
        del_file(draw_path)  # fixme clear the output folder, NOTICE this may be DANGEROUS
    else:
        os.makedirs(draw_path)

    # Train Augmentation
    augmentation_name = args.augmentation_name  # None

    # Data Augmentation
    data_transforms = data_augmentation(data_augmentation_mode, edge_size=edge_size)

    datasets = {x: torchvision.datasets.ImageFolder(os.path.join(dataroot, x), data_transforms[x]) for x in
                ['train', 'val']}  # 2 dataset obj is prepared here and combine together
    dataset_sizes = {x: len(datasets[x]) for x in ['train', 'val']}  # size of each dataset

    dataloaders = {'train': torch.utils.data.DataLoader(datasets['train'], batch_size=batch_size, shuffle=True,
                                                        num_workers=num_workers, drop_last=True),  # colab suggest 2
                   'val': torch.utils.data.DataLoader(datasets['val'], batch_size=batch_size, shuffle=False,
                                                      num_workers=num_workers // 4 + 1, drop_last=True)
                   }

    class_names = sorted(d.name for d in os.scandir(os.path.join(dataroot, 'train')) if d.is_dir())
    if num_classes == 0:
        print("class_names:", class_names)
        num_classes = len(class_names)
    elif len(class_names) == num_classes:
        print("class_names:", class_names)
    else:
        print('classfication number of the model mismatch the dataset requirement of:', len(class_names))
        return -1

    print("*********************************{}*************************************".format('setting'))
    print(args)

    # start tensorboard backend
    writer = SummaryWriter(draw_path) if enable_tensorboard else None
    # if u run locally
    # nohup tensorboard --logdir=/home/MSHT/runs --host=0.0.0.0 --port=7777 &
    # tensorboard --logdir=/home/ZTY/runs --host=0.0.0.0 --port=7777

    # os.environ assignment with a str value cannot fail, so no fallback branches are needed
    if gpu_idx == -1:  # use all cards
        if torch.cuda.device_count() > 1:
            print("Use", torch.cuda.device_count(), "GPUs!")
            # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
            gpu_use = gpu_idx
        else:
            print('we dont have more GPU idx here, try to use gpu_idx=0')
            os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # setting 0 for: only card idx 0 is sighted for this code
            gpu_use = 0
    else:
        # setting k for: only card idx k is sighted for this code
        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_idx)
        gpu_use = gpu_idx

    # device environment
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # get model
    if PromptTuning is not None:
        print('PromptTuning of ', model_idx)
        print('Prompt VPT type:', PromptTuning)

        # initialize the model backbone:
        if Pre_Trained_model_path is None or Pre_Trained_model_path == 'timm':
            base_state_dict = 'timm'
            print('backbone base_state_dict of timm')
        elif os.path.exists(Pre_Trained_model_path):
            print('backbone base_state_dict at: ', Pre_Trained_model_path)
            base_state_dict = torch.load(Pre_Trained_model_path)
        else:
            # a bare `raise` here only produced "No active exception to reraise"
            raise FileNotFoundError('invalid Pre_Trained_model_path for prompting at: '
                                    + str(Pre_Trained_model_path))

        # put the additional prompt tokens to model:
        if Prompt_state_path is None:
            prompt_state_dict = None
            print('prompting with empty prompt_state: prompt_state of None')
        elif os.path.exists(Prompt_state_path):
            print('prompting with prompt_state at: ', Prompt_state_path)
            prompt_state_dict = torch.load(Prompt_state_path)
        else:
            raise FileNotFoundError('invalid prompt_state_dict for prompting, path at:'
                                    + str(Prompt_state_path))

        model = build_promptmodel(num_classes, edge_size, model_idx, Prompt_Token_num=Prompt_Token_num,
                                  VPT_type=PromptTuning, prompt_state_dict=prompt_state_dict,
                                  base_state_dict=base_state_dict)
        # Use FineTuning with prompt tokens (when PromptUnFreeze==True)
        if PromptUnFreeze:
            model.UnFreeze()
            print('prompt tuning with all parameaters un-freezed')

    else:
        # get model: randomly initiate model, except the backbone CNN(when pretrained_backbone is True)
        model = get_model(num_classes, edge_size, model_idx, drop_rate, attn_drop_rate, drop_path_rate,
                          pretrained_backbone, use_cls_token, use_pos_embedding, use_att_module)

        # Manually get the model pretrained on the Imagenet1000
        if Pre_Trained_model_path is not None:
            if not os.path.exists(Pre_Trained_model_path):
                raise FileNotFoundError('Specified Pre_Trained_model_path:' + Pre_Trained_model_path
                                        + ' is NOT avaliable!!!!')
            state_dict = FixStateDict(torch.load(Pre_Trained_model_path), remove_key_head='head')
            model.load_state_dict(state_dict, False)
            print('Specified backbone model weight loaded:', Pre_Trained_model_path)
        else:
            print('building model (no-prompt) with pretrained_backbone status:', pretrained_backbone)
            if pretrained_backbone is True:
                print('timm loaded')

    if linearprobing:
        # Only tuning the last FC layer for CLS task
        for param in model.parameters():  # freeze all parameters
            param.requires_grad = False

        # The previous code compared the enumerate index with the NUMBER of children, which
        # never matches (indices stop at count - 1), so nothing was unfrozen at all.
        # NOTE(review): assumes the last child module is the classification head - confirm per model
        children = list(model.children())
        if children:
            for param in children[-1].parameters():  # unfreeze the last child module
                param.requires_grad = True

    print('GPU:', gpu_use)

    if gpu_use == -1:
        model = nn.DataParallel(model)

    model.to(device)

    try:
        summary(model, input_size=(3, edge_size, edge_size))  # should be after .to(device)
    except Exception as err:
        # the summary is informative only, training goes on without it
        print('model summary is skipped:', err)

    print("model :", model_idx)

    # Augmentation
    Augmentation = get_online_augmentation(augmentation_name, p=0.5, class_num=num_classes,
                                           batch_size=batch_size, edge_size=edge_size, device=device)

    if augmentation_name not in ('CellMix-Split', 'CellMix-Group', 'CellMix-Random'):
        fix_position_ratio_scheduler = None
        puzzle_patch_size_scheduler = None
    else:
        # setting puzzle_patch_size and fix_position_ratio schedulers
        fix_position_ratio_scheduler = ratio_scheduler(total_epoches=num_epochs,
                                                       warmup_epochs=0,
                                                       basic_ratio=0.5,
                                                       strategy=args.ratio_strategy,  # 'linear'
                                                       fix_position_ratio=args.fix_position_ratio,
                                                       threshold=args.loss_drive_threshold)

        puzzle_patch_size_scheduler = patch_scheduler(total_epoches=num_epochs,
                                                      warmup_epochs=0,
                                                      edge_size=edge_size,
                                                      basic_patch=16,
                                                      strategy=args.patch_strategy,  # 'random', 'linear' or 'loop'
                                                      threshold=args.loss_drive_threshold,
                                                      fix_patch_size=args.fix_patch_size,  # 16,32,48,64,96,128,192
                                                      patch_size_jump=args.patch_size_jump)  # 'odd' or 'even'

    # Default cross entropy of one-hot outputs: [B,CLS] and idx label [B] long tensor
    # augmentation loss is SoftlabelCrossEntropy
    if Augmentation is not None and augmentation_name != 'Cutout':
        criterion = SoftlabelCrossEntropy()
    else:
        criterion = nn.CrossEntropyLoss()

    if opt_name == 'SGD':
        optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.8, weight_decay=0.005)
        scheduler = lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.5)  # 15 0.1  default SGD StepLR scheduler
    elif opt_name == 'Adam':
        optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=0.01)
        scheduler = None
    else:
        raise ValueError('no optimizer: ' + str(opt_name))

    if enable_sam:
        from utils.sam import SAM

        if opt_name == 'SGD':
            base_optimizer = torch.optim.SGD  # define an optimizer for the "sharpness-aware" update
            optimizer = SAM(model.parameters(), base_optimizer, lr=lr, momentum=0.8)
            scheduler = None
        elif opt_name == 'Adam':
            base_optimizer = torch.optim.Adam  # define an optimizer for the "sharpness-aware" update
            optimizer = SAM(model.parameters(), base_optimizer, lr=lr, weight_decay=0.01)
        else:
            raise ValueError('no optimizer: ' + str(opt_name))

    if lrf > 0:  # use cosine learning rate schedule
        import math

        # cosine Scheduler by https://arxiv.org/pdf/1812.01187.pdf
        def cosine_lr_factor(epoch_idx):
            return ((1 + math.cos(epoch_idx * math.pi / num_epochs)) / 2) * (1 - lrf) + lrf

        scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=cosine_lr_factor)

    # train
    model_ft = train_model(model, dataloaders, criterion, optimizer, class_names, dataset_sizes,
                           fix_position_ratio_scheduler=fix_position_ratio_scheduler,
                           puzzle_patch_size_scheduler=puzzle_patch_size_scheduler,
                           Augmentation=Augmentation,
                           edge_size=edge_size, model_idx=model_idx, num_epochs=num_epochs,
                           intake_epochs=intake_epochs, check_minibatch=check_minibatch,
                           scheduler=scheduler, device=device, draw_path=draw_path,
                           enable_attention_check=enable_attention_check,
                           enable_visualize_check=enable_visualize_check,
                           enable_sam=enable_sam, writer=writer)

    # save model: if it is a multi-GPU (DataParallel) model, save the wrapped single-GPU module
    trained = model_ft.module if gpu_use == -1 else model_ft

    if PromptTuning is None or PromptUnFreeze:
        torch.save(trained.state_dict(), save_model_path)
    else:
        # frozen-backbone prompt tuning: save only what obtain_prompt() returns
        # fixme maybe bug at DP module.obtain_prompt, just model.obtain_prompt is enough
        torch.save(trained.obtain_prompt(), save_model_path)

    if gpu_use == -1:
        print('model trained by multi-GPUs has its single GPU copy saved at ', save_model_path)
    else:
        print('model trained by GPU (idx:' + str(gpu_use) + ') has been saved at ', save_model_path)
Training') + + # Model Name or index + parser.add_argument('--model_idx', default='Hybrid2_384_401_testsample', type=str, help='Model Name or index') + # drop_rate, attn_drop_rate, drop_path_rate + parser.add_argument('--drop_rate', default=0.0, type=float, help='dropout rate , default 0.0') + parser.add_argument('--attn_drop_rate', default=0.0, type=float, help='dropout rate Aftter Attention, default 0.0') + parser.add_argument('--drop_path_rate', default=0.0, type=float, help='drop path for stochastic depth, default 0.0') + + # Abalation Studies + parser.add_argument('--cls_token_off', action='store_true', help='use cls_token in model structure') + parser.add_argument('--pos_embedding_off', action='store_true', help='use pos_embedding in model structure') + # 'SimAM', 'CBAM', 'SE' 'None' + parser.add_argument('--att_module', default='SimAM', type=str, help='use which att_module in model structure') + + # backbone_PT_off by default is false, in default setting the backbone weight is required + parser.add_argument('--backbone_PT_off', action='store_true', help='use a freash backbone weight in training') + + # Enviroment parameters + parser.add_argument('--gpu_idx', default=-1, type=int, + help='use a single GPU with its index, -1 to use multiple GPU') + + # Path parameters + parser.add_argument('--dataroot', default='/data/MIL_Experiment/dataset/ROSE_CLS', + help='path to dataset') + parser.add_argument('--model_path', default='/home/pancreatic-cancer-project/saved_models', + help='path to save model state-dict') + parser.add_argument('--draw_root', default='/home/pancreatic-cancer-project/runs', + help='path to draw and save tensorboard output') + + # Help tool parameters + parser.add_argument('--paint', action='store_false', help='paint in front desk') # matplotlib.use('Agg') + + # check tool parameters + parser.add_argument('--enable_tensorboard', action='store_true', help='enable tensorboard to save status') + parser.add_argument('--enable_attention_check', 
action='store_true', help='check and save attention map') + parser.add_argument('--enable_visualize_check', action='store_true', help='check and save pics') + + # Tuning setting + # PromptTuning + parser.add_argument('--PromptTuning', default=None, type=str, + help='use Prompt Tuning strategy instead of Finetuning') + # Prompt_Token_num + parser.add_argument('--Prompt_Token_num', default=20, type=int, help='Prompt_Token_num') + + # PromptUnFreeze + parser.add_argument('--PromptUnFreeze', action='store_true', help='prompt tuning with all parameaters un-freezed') + + # linearprobing + parser.add_argument('--linearprobing', action='store_true', help='use linearprobing tuning') + + # Finetuning a Pretrained model at PATH + # '/home/MIL_Experiment/saved_models/Hybrid2_384_PreTrain_000.pth' + parser.add_argument('--Pre_Trained_model_path', default=None, type=str, + help='Finetuning a trained model in this dataset') + # Prompt_state_path + parser.add_argument('--Prompt_state_path', default=None, type=str, + help='Prompt_state_path for prompt tokens') + + # Training status parameters + # SAM + parser.add_argument('--enable_sam', action='store_true', help='use SAM strategy in training') + + # Online augmentation_name + parser.add_argument('--augmentation_name', default=None, type=str, help='Online augmentation name') + + # CellMix ablation: loss_drive strategy + parser.add_argument('--ratio_strategy', default=None, type=str, help='CellMix ratio scheduler strategy') + parser.add_argument('--patch_strategy', default=None, type=str, help='CellMix patch scheduler strategy') + parser.add_argument('--loss_drive_threshold', default=4.0, type=float, help='CellMix loss_drive_threshold') + + # CellMix ablation: fix_patch_size patch_size_jump + parser.add_argument('--fix_position_ratio', default=0.5, type=float, help='CellMix ratio scheduler strategy') + parser.add_argument('--fix_patch_size', default=None, type=int, help='CellMix ablation using fix_patch_size') + 
parser.add_argument('--patch_size_jump', default=None, type=str, help='CellMix patch_size_jump strategy') + + # Dataset based parameters + parser.add_argument('--num_classes', default=0, type=int, help='classification number, default 0 for auto-fit') + parser.add_argument('--edge_size', default=384, type=int, help='edge size of input image') # 224 256 384 1000 + # Dataset specific augmentations in dataloader + parser.add_argument('--data_augmentation_mode', default=0, type=int, help='data_augmentation_mode') + + # Training seting parameters + parser.add_argument('--batch_size', default=8, type=int, help='Training batch_size default 8') + parser.add_argument('--num_epochs', default=50, type=int, help='training epochs') + parser.add_argument('--intake_epochs', default=0, type=int, help='only save model at epochs after intake_epochs') + parser.add_argument('--lr', default=0.00001, type=float, help='learing rate') + parser.add_argument('--lrf', type=float, default=0.0, + help='learing rate decay rate, default 0(not enabled), suggest 0.1 and lr=0.00005') + parser.add_argument('--opt_name', default='Adam', type=str, help='optimizer name Adam or SGD') + + # check_minibatch for painting pics + parser.add_argument('--check_minibatch', default=None, type=int, help='check batch_size') + parser.add_argument('--num_workers', default=2, type=int, help='use CPU num_workers , default 2 for colab') + + return parser + + +if __name__ == '__main__': + # setting up the random seed + setup_seed(42) + + parser = get_args_parser() + args = parser.parse_args() + main(args) diff --git a/PuzzleTuning/dataprocessing/CPIA-main/README.md b/PuzzleTuning/dataprocessing/CPIA-main/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ed8802e13c4d350c90e8496a37669123386aa62a --- /dev/null +++ b/PuzzleTuning/dataprocessing/CPIA-main/README.md @@ -0,0 +1,133 @@ +# CPIA_data_process + +The CPIA dataset contains 3,474,406 (the total number is growing as we continue to process 
the datasets) standardized images, covering over 50 organs/tissues and about 98 kinds of diseases, which includes two main data types: whole slide images (WSIs) and characteristic regions of interest (ROIs). + +In this repo, we provide the code for processing all sub-datasets within the CPIA dataset. + +![image](https://github.com/Desperadodo/CPIA_data_process/assets/87553719/ec9631e0-c398-4711-9eca-b764333ef10b) +*The composition and WSI processing strategy of the CPIA dataset.* + +![image](https://github.com/Desperadodo/CPIA_data_process/assets/87553719/2f8660a5-429d-4e42-97f5-8bc5eeb4c587) +*The multi-scale strategy and diverse characteristics of the CPIA dataset.* + +## WSI +Each WSI dataset is divided into four levels by a single Python program. Most whole slide images (WSIs) are stored in the SVS format, whose header records the microns-per-pixel (MPP) value. Our processing program automatically identifies the MPP of each image and standardizes it to 0.4942 µm/pixel by adjusting the edge length of each patch. Finally, the patch images are divided into four sizes, with edge lengths of 3840, 960, 384, and 96, and stored in their respective folders.
+| Sub-dataset name | Process Code | Suffix | +|-------------------|--------------|--------| +| CAM16 | CPIA_WSI | tif | +| CATCH dataset | CPIA_WSI | svs | +| CMB-CRC | CPIA_WSI | svs | +| CMB-LCA | CPIA_WSI | svs | +| CMB-MEL | CPIA_WSI | svs | +| CPTAC-AML | CPIA_WSI | svs | +| CPTAC-BRCA | CPIA_WSI | svs | +| CPTAC-CCRCC | CPIA_WSI | svs | +| CPTAC-CM | CPIA_WSI | svs | +| CPTAC-COAD | CPIA_WSI | svs | +| CPTAC-HNSCC | CPIA_WSI | svs | +| CPTAC-LSCC | CPIA_WSI | svs | +| CPTAC-LUAD | CPIA_WSI | svs | +| CPTAC-OV | CPIA_WSI | svs | +| CPTAC-PDA | CPIA_WSI | svs | +| CPTAC-SAR | CPIA_WSI | svs | +| CPTAC-UCEC | CPIA_WSI | svs | +| HER2 tumor ROIs | CPIA_WSI | svs | +| MSKCC(SLN-Breast) | CPIA_WSI | svs | +| PAIP2019 | CPIA_WSI | svs | +| PAIP2020 | CPIA_WSI | svs | +| PAIP2021 | CPIA_WSI | svs | +| Post-NAT-BRCA | CPIA_WSI | svs | +| TCGA-ACC | CPIA_WSI | svs | +| TCGA-BLCA | CPIA_WSI | svs | +| TCGA-BRCA | CPIA_WSI | svs | +| TCGA-CESC | CPIA_WSI | svs | +| TCGA-CHOL | CPIA_WSI | svs | +| TCGA-COAD | CPIA_WSI | svs | +| TCGA-DLBC | CPIA_WSI | svs | +| TCGA-ESCA | CPIA_WSI | svs | +| TCGA-GBM | CPIA_WSI | svs | +| TCGA-HNSC | CPIA_WSI | svs | +| TCGA-KICH | CPIA_WSI | svs | +| TCGA-KIRC | CPIA_WSI | svs | +| TCGA-KIRP | CPIA_WSI | svs | +| TCGA-LGG | CPIA_WSI | svs | +| TCGA-LIHC | CPIA_WSI | svs | +| TCGA-LUAD | CPIA_WSI | svs | +| TCGA-LUSC | CPIA_WSI | svs | +| TCGA-MESO | CPIA_WSI | svs | +| TCGA-OV | CPIA_WSI | svs | +| TCGA-PAAD | CPIA_WSI | svs | +| TCGA-PCPG | CPIA_WSI | svs | +| TCGA-PRAD | CPIA_WSI | svs | +| TCGA-READ | CPIA_WSI | svs | +| TCGA-SARC | CPIA_WSI | svs | +| TCGA-SKCM | CPIA_WSI | svs | +| TCGA-STAD | CPIA_WSI | svs | +| TCGA-TGCT | CPIA_WSI | svs | +| TCGA-THCA | CPIA_WSI | svs | +| TCGA-THYM | CPIA_WSI | svs | +| TCGA-UCEC | CPIA_WSI | svs | +| TCGA-UCS | CPIA_WSI | svs | +| TCGA-UVM | CPIA_WSI | svs | + + +## ROI +The processing code for the ROI dataset is related to the structure of the original dataset. 
After the processing is complete, each dataset folder contains only the images of that dataset; every image is 384x384 pixels and stored in JPG format. + +Before processing the data, make sure the program's input path contains only the images to be processed, and pass the correct suffix for those images (this might require slight adjustments to the original dataset folder). + +| Sub-dataset name | Process Code | Suffix | Add_class | +|------------------------------------------------------|------------------------------------------------|--------|-----------| +| ANHIR | CPIA_ROI_1_Crop&Resize | png | FALSE | +| BCSS | CPIA_ROI_1_Crop&Resize | png | FALSE | +| AML_Cytomorphology | CPIA_ROI_1_Crop&Resize | tiff | FALSE | +| BCCD | CPIA_ROI_1_Crop&Resize | jpg | FALSE | +| Blood_Cell_Images | CPIA_ROI_1_Crop&Resize | jpg | TRUE | +| BreakHis | CPIA_ROI_0_BreakHis + CPIA_ROI_1_Crop&Resize | jpg | TRUE | +| Breast_Histopathology_Images | CPIA_ROI_1_Crop&Resize | png | FALSE | +| BreastCancerCells | CPIA_ROI_1_Crop&Resize | tif | FALSE | +| BreastPathQ | CPIA_ROI_1_Crop&Resize | tif | FALSE | +| BreCaHAD | CPIA_ROI_1_Crop&Resize | tif | FALSE | +| Chaoyang | CPIA_ROI_1_Crop&Resize | jpg | FALSE | +| Colorectal Histology MNIST | CPIA_ROI_1_Crop&Resize | tif | FALSE | +| CoNSeP | CPIA_ROI_1_Crop&Resize | png | FALSE | +| CPM_15 | CPIA_ROI_1_Crop&Resize | png | FALSE | +| CPM_17 | CPIA_ROI_1_Crop&Resize | png | TRUE | +| CRAG | CPIA_ROI_1_Crop&Resize | jpg | FALSE | +| CRChistophenotypes | CPIA_ROI_1_Crop&Resize | bmp | FALSE | +| CRC-TP | CPIA_ROI_1_Crop&Resize | png | TRUE | +| CRC-VAL-HE-7K | CPIA_ROI_1_Crop&Resize | jpg | FALSE | +| CryoNuSeg | CPIA_ROI_1_Crop&Resize | tif | FALSE | +| DataBiox | CPIA_ROI_1_MicroScope DataBiox | jpg | FALSE | +| GasHisSDB | CPIA_ROI_0_GasHisSDB + CPIA_ROI_1_Crop&Resize | jpg | FALSE | +| Gastrointestinal_Cancer | CPIA_ROI_1_Crop&Resize | jpg | FALSE | +| Gleason 2019 |
CPIA_ROI_1_Crop&Resize | jpg | FALSE | +| HuBMAP+HBA_512 | CPIA_ROI_1_Crop&Resize | png | FALSE | +| Kumar(MoNuSeg) | CPIA_ROI_1_Crop&Resize | tif | FALSE | +| LISC | CPIA_ROI_1_Crop&Resize | bmp | FALSE | +| LizardDataset | CPIA_ROI_1_Crop&Resize | png | FALSE | +| Lung_and_Colon_Cancer_Histopathological_Images_Colon | CPIA_ROI_1_Crop&Resize | jpg | TRUE | +| Lung_and_Colon_Cancer_Histopathological_Images_Lung | CPIA_ROI_1_Crop&Resize | jpg | TRUE | +| LYON19 | CPIA_ROI_1_Crop&Resize | png | FALSE | +| Malignant_Lymphoma_Dataset | CPIA_ROI_1_Crop&Resize | tif | FALSE | +| MARS | CPIA_ROI_1_Crop&Resize | png | FALSE | +| MHIST | CPIA_ROI_1_Crop&Resize | png | FALSE | +| MoNuSAC | CPIA_ROI_1_Crop&Resize | tif | FALSE | +| Monuseg_train&test | CPIA_ROI_1_Crop&Resize | png | FALSE | +| Naylor et al, IEEE TMI 2019 | CPIA_ROI_1_Crop&Resize | png | FALSE | +| NCT-CRC-HE-100K | CPIA_ROI_1_Crop&Resize | tif | FALSE | +| Osteosarcoma_Tumor_Assessment | CPIA_ROI_1_Crop&Resize | jpg | FALSE | +| P_falciparum | CPIA_ROI_1_MicroScope P.falciparum | jpg | FALSE | +| P_uninfected | CPIA_ROI_1_MicroScope P.uninfected | jpg | FALSE | +| P_vivax | CPIA_ROI_1_MicroScope P.vivax | jpg | FALSE | +| PCam | CPIA_ROI_1_Crop&Resize | tif | FALSE | +| SICAPv2 | CPIA_ROI_1_Crop&Resize | jpg | FALSE | +| SIPaKMeD | CPIA_ROI_0_SIPaKMeD + CPIA_ROI_1_Crop&Resize | jpg | FALSE | +| warwick_CLS | CPIA_ROI_0_Warwick + CPIA_ROI_1_Crop&Resize | jpg | FALSE | +| WBC | CPIA_ROI_1_Crop&Resize | jpg | TRUE | +| Weinart et al, Scientific Reports 2012 | CPIA_ROI_1_Crop&Resize | jpg | FALSE | +| WSSS4LUAD | CPIA_ROI_1_Crop&Resize | png | TRUE | + + + + diff --git a/PuzzleTuning/dataprocessing/CPIA-main/ROI/CPIA_ROI_0_BreakHis.py b/PuzzleTuning/dataprocessing/CPIA-main/ROI/CPIA_ROI_0_BreakHis.py new file mode 100644 index 0000000000000000000000000000000000000000..472eac90aab23ae1e31b7c139dbf193851d8a1b9 --- /dev/null +++ b/PuzzleTuning/dataprocessing/CPIA-main/ROI/CPIA_ROI_0_BreakHis.py @@ -0,0 +1,120 @@ 
+""" +CPIA_ROI_0_BreakHis.py ver 23.6.9 +This code aims to split images of different zooming size into different folders,this code also puts different classes +into different folders +""" +import argparse +import os +from PIL import Image +from tqdm import tqdm + + +def get_args_parser(): + parser = argparse.ArgumentParser('CPIA dataset ROI part Warwick_QU dataset pre-processing', add_help=False) + parser.add_argument('--input_root', default='..', type=str, + help='The root that contains the orginal images. Please make sure that there is no unwanted ' + 'images with corresponding suffix under the same root') + parser.add_argument('--output_root', default='..', type=str, + help='The root for the resized and cropped output images. If the root is not provided, this ' + 'program will automatically make an output path') + return parser + + +def make_and_clear_path(file_pack_path): + if not os.path.exists(file_pack_path): + os.makedirs(file_pack_path) + + +def find_all_files(root, suffix=None): + """ + Return a list of file paths ended with specific suffix + """ + res = [] + for root, _, files in os.walk(root): + for f in files: + if suffix is not None and not f.endswith(suffix): + continue + res.append(os.path.join(root, f)) + print(res) + print(len(res)) + return res + + +def save_file(f_image, save_dir, suffix='.jpg'): + filepath, _ = os.path.split(save_dir) + if not os.path.exists(filepath): + os.makedirs(filepath) + f_image.save(save_dir + suffix) + + +def pc_to_stander(root_from, root_to): + root_target = root_to + make_and_clear_path(root_target) + + f_dir_list = find_all_files(root=root_from, suffix=".png") + print(f_dir_list) + name_dict = {} + + for seq in tqdm(range(len(f_dir_list))): + f_dir = f_dir_list[seq] + _, str = os.path.split(f_dir) + mp = str.split("-")[-2] + type = (str.split("_")[2]).split("-")[0] + name = str.split(".")[0] + print(mp) + print(type) + + f_img = Image.open(f_dir) + if mp == '40': + root_target = os.path.join(root_to, '40') + elif mp 
== '100': + root_target = os.path.join(root_to, '100') + elif mp == '200': + root_target = os.path.join(root_to, '200') + else: + root_target = os.path.join(root_to, '400') + if type == 'DC': + root_target = os.path.join(root_target, 'ductal_carcinoma') + elif type == 'LC': + root_target = os.path.join(root_target, 'lobular_carcinoma') + elif type == 'MC': + root_target = os.path.join(root_target, 'mucinous_carcinoma') + elif type == 'PC': + root_target = os.path.join(root_target, 'papillary_carcinoma') + elif type == 'A': + root_target = os.path.join(root_target, 'adenosis') + elif type == 'F': + root_target = os.path.join(root_target, 'fibroadenoma') + elif type == 'PT': + root_target = os.path.join(root_target, 'phyllodes_tumor') + else: + root_target = os.path.join(root_target, 'tubular_adenoma') + + save_dir = os.path.join(root_target, name) + + + name_dict[save_dir] = f_dir + + save_file(f_img, save_dir) + + root_target, _ = os.path.split(root_to) + root_target, _ = os.path.split(root_target) + +if __name__ == '__main__': + args = get_args_parser() + args = args.parse_args() + + input_root = args.input_root + output_root = args.output_root + + pc_to_stander(input_root, output_root) + + + + + + + + + + diff --git a/PuzzleTuning/dataprocessing/CPIA-main/ROI/CPIA_ROI_0_GasHisSDB.py b/PuzzleTuning/dataprocessing/CPIA-main/ROI/CPIA_ROI_0_GasHisSDB.py new file mode 100644 index 0000000000000000000000000000000000000000..79edb18cec99a0f39c6d21fccb82440961ad4ae7 --- /dev/null +++ b/PuzzleTuning/dataprocessing/CPIA-main/ROI/CPIA_ROI_0_GasHisSDB.py @@ -0,0 +1,131 @@ +""" +CPIA_ROI_0_GasHisSDB.py ver 23.6.9 +This code aims to split images of different zooming size into different folders, this code also puts different classes +into different folders +""" + +import argparse +import os +import re +import csv +import shutil +import pandas as pd +from PIL import Image +from tqdm import tqdm +import torchvision.transforms + + +def get_args_parser(): + parser = 
argparse.ArgumentParser('CPIA dataset ROI part Warwick_QU dataset pre-processing', add_help=False) + parser.add_argument('--input_root', default='..', type=str, + help='The root that contains the orginal images. Please make sure that there is no unwanted ' + 'images with corresponding suffix under the same root') + parser.add_argument('--output_root', default='..', type=str, + help='The root for the resized and cropped output images. If the root is not provided, this ' + 'program will automatically make an output path') + return parser + + +def del_file(filepath): + """ + Delete all files and folders in one directory + :param filepath: file path + :return: + """ + del_list = os.listdir(filepath) + for f in del_list: + file_path = os.path.join(filepath, f) + if os.path.isfile(file_path): + os.remove(file_path) + elif os.path.isdir(file_path): + shutil.rmtree(file_path) + + +def make_and_clear_path(file_pack_path): + if not os.path.exists(file_pack_path): + os.makedirs(file_pack_path) + + +def find_all_files(root, suffix=None): + """ + Return a list of file paths ended with specific suffix + """ + res = [] + for root, _, files in os.walk(root): + for f in files: + if suffix is not None and not f.endswith(suffix): + continue + res.append(os.path.join(root, f)) + print(files) + return res + + +def save_file(f_image, save_dir, suffix='.jpg'): + + filepath, _ = os.path.split(save_dir) + if not os.path.exists(filepath): + os.makedirs(filepath) + f_image.save(save_dir + suffix) + + +def pc_to_stander(root_from, root_to): + root_target = root_to + make_and_clear_path(root_target) + + f_dir_list = find_all_files(root=root_from, suffix=".png") + print(f_dir_list) + name_dict = {} + + for seq in tqdm(range(len(f_dir_list))): + f_dir = f_dir_list[seq] + f_img = Image.open(f_dir) + _, img_name = os.path.split(f_dir) + name = img_name.split('.')[0] + + if '80' in f_dir: + root_target = os.path.join(root_to, '80') + elif '120' in f_dir: + root_target = os.path.join(root_to, '120') 
+ else: + root_target = os.path.join(root_to, '160') + + if 'Normal' in f_dir: + save_dir = os.path.join(root_target, 'Normal') + else: + save_dir = os.path.join(root_target, 'Abnormal') + + """if img_name == result[i][0]: + root_target = os.path.join(root_to, result[i][1]) + if result[i][1] == 0: + i1 += 1 + save_dir = os.path.join(root_target, str(i1)) + elif result[i][1] == 1: + i2 += 1 + save_dir = os.path.join(root_target, str(i2)) + else: + i3 += 1 + save_dir = os.path.join(root_target, str(i3)) + break + else: + continue""" + save_dir = os.path.join(save_dir, name) + + name_dict[save_dir] = f_dir + + save_file(f_img, save_dir) + + root_target, _ = os.path.split(root_to) + root_target, _ = os.path.split(root_target) + pd.DataFrame.from_dict(name_dict, orient='index', columns=['origin path']).to_csv( + os.path.join(root_target, 'name_dict_gashis.csv') + ) + + +if __name__ == '__main__': + args = get_args_parser() + args = args.parse_args() + + input_root = args.input_root + output_root = args.output_root + + pc_to_stander(input_root, output_root) \ No newline at end of file diff --git a/PuzzleTuning/dataprocessing/CPIA-main/ROI/CPIA_ROI_0_SIPaKMeD.py b/PuzzleTuning/dataprocessing/CPIA-main/ROI/CPIA_ROI_0_SIPaKMeD.py new file mode 100644 index 0000000000000000000000000000000000000000..32ac18d3f8bfc9754957504980f9fe8bef056db1 --- /dev/null +++ b/PuzzleTuning/dataprocessing/CPIA-main/ROI/CPIA_ROI_0_SIPaKMeD.py @@ -0,0 +1,142 @@ +""" +CPIA_ROI_0_SIPaKMeD.py +This code aims to split test/train pictures and img/annotation pictures into different folders +The original SIPaKMeD dataset has a CROPPED part inside the folder of each class. This code splits +the original view images and cropped images into two folders. 
+""" +import argparse +import os +import re +import csv +import shutil +import pandas as pd +from PIL import Image +from tqdm import tqdm +import torchvision.transforms + + +def get_args_parser(): + parser = argparse.ArgumentParser('CPIA dataset ROI part Warwick_QU dataset pre-processing', add_help=False) + parser.add_argument('--input_root', default='..', type=str, + help='The root that contains the orginal images. Please make sure that there is no unwanted ' + 'images with corresponding suffix under the same root') + parser.add_argument('--output_root', default='..', type=str, + help='The root for the resized and cropped output images. If the root is not provided, this ' + 'program will automatically make an output path') + return parser + + +def del_file(filepath): + """ + Delete all files and folders in one directory + :param filepath: file path + :return: + """ + del_list = os.listdir(filepath) + for f in del_list: + file_path = os.path.join(filepath, f) + if os.path.isfile(file_path): + os.remove(file_path) + elif os.path.isdir(file_path): + shutil.rmtree(file_path) + + +def make_and_clear_path(file_pack_path): + if not os.path.exists(file_pack_path): + os.makedirs(file_pack_path) + # del_file(file_pack_path) + + +def find_all_files(root, suffix=None): + """ + Return a list of file paths ended with specific suffix + """ + res = [] + for root, _, files in os.walk(root): + for f in files: + if suffix is not None and not f.endswith(suffix): + continue + res.append(os.path.join(root, f)) + return res + + +def save_file(f_image, save_dir, suffix='.jpg'): + filepath, _ = os.path.split(save_dir) + if not os.path.exists(filepath): + os.makedirs(filepath) + f_image.save(save_dir + suffix) + + +def pc_to_stander(root_from, root_to): + root_target = root_to + make_and_clear_path(root_target) + + f_dir_list = find_all_files(root=root_from, suffix=".bmp") + print(f_dir_list) + name_dict = {} + i1 = 0 + i2 = 0 + i3 = 0 + i4 = 0 + i5 = 0 + + for seq in 
tqdm(range(len(f_dir_list))): + f_dir = f_dir_list[seq] + + f_img = Image.open(f_dir) + + + if 'im_Superficial-Intermediate' in f_dir: + if 'CROPPED' in f_dir: + root_target = os.path.join(root_to, 'Cropped') + else: + root_target = os.path.join(root_to, "Full") + root_target = os.path.join(root_target, 'Sup_Intermediate') + i1 += 1 + save_dir = os.path.join(root_target, str(i1)) + elif 'im_Parabasal' in f_dir: + if 'CROPPED' in f_dir: + root_target = os.path.join(root_to, 'Cropped') + else: + root_target = os.path.join(root_to, "Full") + root_target = os.path.join(root_target, 'Parabasal') + i2 += 1 + save_dir = os.path.join(root_target, str(i2)) + elif 'im_Dyskeratotic' in f_dir: + if 'CROPPED' in f_dir: + root_target = os.path.join(root_to, 'Cropped') + else: + root_target = os.path.join(root_to, "Full") + root_target = os.path.join(root_target, 'Dyskeratotic') + i3 += 1 + save_dir = os.path.join(root_target, str(i3)) + elif 'im_Koilocytotic' in f_dir: + if 'CROPPED' in f_dir: + root_target = os.path.join(root_to, 'Cropped') + else: + root_target = os.path.join(root_to, "Full") + root_target = os.path.join(root_target, 'Koilocytotic') + i4 += 1 + save_dir = os.path.join(root_target, str(i4)) + else: + if 'CROPPED' in f_dir: + root_target = os.path.join(root_to, 'Cropped') + else: + root_target = os.path.join(root_to, "Full") + root_target = os.path.join(root_target, 'Metaplastic') + i5 += 1 + save_dir = os.path.join(root_target, str(i5)) + + name_dict[save_dir] = f_dir + + save_file(f_img, save_dir) + + +if __name__ == '__main__': + args = get_args_parser() + args = args.parse_args() + + input_root = args.input_root + output_root = args.output_root + + pc_to_stander(input_root, output_root) \ No newline at end of file diff --git a/PuzzleTuning/dataprocessing/CPIA-main/ROI/CPIA_ROI_0_Warwick.py b/PuzzleTuning/dataprocessing/CPIA-main/ROI/CPIA_ROI_0_Warwick.py new file mode 100644 index 
0000000000000000000000000000000000000000..2b21d305850b49750299f977474706d8414e4ef9 --- /dev/null +++ b/PuzzleTuning/dataprocessing/CPIA-main/ROI/CPIA_ROI_0_Warwick.py @@ -0,0 +1,127 @@ +""" +CPIA_ROI_0_Warwick.py ver 23.6.9 +This code aims to split test/train pictures and img/annotation pictures into different folders +""" +import argparse +import os +import csv +import shutil +import pandas as pd +from PIL import Image +from tqdm import tqdm + + +def get_args_parser(): + parser = argparse.ArgumentParser('CPIA dataset ROI part Warwick_QU dataset pre-processing', add_help=False) + parser.add_argument('--input_root', default='..', type=str, + help='The root that contains the orginal images. Please make sure that there is no unwanted ' + 'images with corresponding suffix under the same root') + parser.add_argument('--output_root', default='..', type=str, + help='The root for the resized and cropped output images. If the root is not provided, this ' + 'program will automatically make an output path') + return parser + +def del_file(filepath): + """ + Delete all files and folders in one directory + :param filepath: file path + :return: + """ + del_list = os.listdir(filepath) + for f in del_list: + file_path = os.path.join(filepath, f) + if os.path.isfile(file_path): + os.remove(file_path) + elif os.path.isdir(file_path): + shutil.rmtree(file_path) + + +def make_and_clear_path(file_pack_path): + if not os.path.exists(file_pack_path): + os.makedirs(file_pack_path) + # del_file(file_pack_path) + + +def find_all_files(root, suffix=None): + """ + Return a list of file paths ended with specific suffix + """ + res = [] + for root, _, files in os.walk(root): + for f in files: + if suffix is not None and not f.endswith(suffix): + continue + res.append(os.path.join(root, f)) + print(files) + return res + + +def save_file(f_image, save_dir, suffix='.jpg'): + filepath, _ = os.path.split(save_dir) + if not os.path.exists(filepath): + os.makedirs(filepath) + f_image.save(save_dir + 
suffix) + + +def pc_to_stander(root_from, root_to): + root_target = root_to + make_and_clear_path(root_target) + + f_dir_list = find_all_files(root=root_from, suffix=".bmp") + print(f_dir_list) + name_dict = {} + + with open(r'E:\A_bunch_of_data\Raw\Warwick QU Dataset (Released 2016_07_08)\Grade.csv', 'r') as f: + reader = csv.reader(f) + result = list(reader) + length = len(result) + + for seq in tqdm(range(len(f_dir_list))): + f_dir = f_dir_list[seq] + f_img = Image.open(f_dir) + _, img_name = os.path.split(f_dir) + img_name = img_name.split('.')[0] + + if img_name[0:4] == 'test': + save_dir = os.path.join(root_target, 'test') + else: + save_dir = os.path.join(root_target, 'train') + + if img_name[-4:] == 'anno': + save_dir = os.path.join(save_dir, 'mask') + img_name = img_name[0:-5] + else: + save_dir = os.path.join(save_dir, 'data') + + for i in range(length): + # print(img_name.split('.')[0]) + l = len(result[i][0]) + if img_name == result[i][0]: + if result[i][2] == ' malignant': + save_dir = os.path.join(save_dir, 'malignant') + print('1') + else: + save_dir = os.path.join(save_dir, 'benign') + break + else: + continue + save_dir = os.path.join(save_dir, img_name.split('.')[0]) + name_dict[save_dir] = f_dir + save_file(f_img, save_dir) + + root_target, _ = os.path.split(root_to) + root_target, _ = os.path.split(root_target) + pd.DataFrame.from_dict(name_dict, orient='index', columns=['origin path']).to_csv( + os.path.join(root_target, 'name_dict_warwick.csv') + ) + + +if __name__ == '__main__': + args = get_args_parser() + args = args.parse_args() + + input_root = args.input_root + output_root = args.output_root + + pc_to_stander(input_root, output_root) + diff --git a/PuzzleTuning/dataprocessing/CPIA-main/ROI/CPIA_ROI_1_Crop&Resize.py b/PuzzleTuning/dataprocessing/CPIA-main/ROI/CPIA_ROI_1_Crop&Resize.py new file mode 100644 index 0000000000000000000000000000000000000000..78aee4185a083d0c59e73af2dc875686d8cea86e --- /dev/null +++ 
def _str2bool(value):
    """Convert a command-line token such as 'True' / 'false' / '0' into a real bool."""
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value, got {!r}'.format(value))


def get_args_parser():
    """
    Build the command-line parser for the ROI crop-and-resize script

    Options: ``--input_root``, ``--output_root``, ``--suffix``, ``--size`` and
    ``--add_class``. The parser is created with ``add_help=False``.

    :return: the configured ``argparse.ArgumentParser``
    """
    parser = argparse.ArgumentParser('CPIA dataset ROI part image cropping and resizing', add_help=False)
    parser.add_argument('--input_root', default='..', type=str,
                        help='The root that contains the orginal images. Please make sure that there is no unwanted '
                             'images with corresponding suffix under the same root')
    parser.add_argument('--output_root', default=None, type=str,
                        help='The root for the resized and cropped output images. If the root is not provided, this '
                             'program will automatically make an output path')
    parser.add_argument('--suffix', default='jpg', type=str,
                        help='The suffix of the input image')
    parser.add_argument('--size', default=384, type=int,
                        help='The size of the output image')
    # type=bool would turn every non-empty string (including 'False') into
    # True, so the flag value is parsed explicitly instead.
    parser.add_argument('--add_class', default=False, type=_str2bool,
                        help='Add class information to the image name.')

    return parser
""" + Return the cropping zone of a non-square image + :param img_size: img.size + :return: list that contains the cropping zone + """ + width, height = img_size # Get dimensions + a = min(width, height) + left = int((width - a) / 2) + top = int((height - a) / 2) + right = left + a + bottom = top + a + + return [left, top, right, bottom] + + +def data_crop_resize(class_root, output_root, suffix, size=384, add_class=False): + all_data = find_all_files(class_root, suffix) + for data_root in all_data: + if data_root.endswith('.txt'): + continue + elif data_root.endswith('.DS_Store'): + continue + elif output_root is None: + new_data_root = (data_root + '_Lite').split('.')[0] + # specially made for GS dataset: + """new_data_root = (data_root + '_Lite').replace('.', '_') + new_data_root = new_data_root.replace('_jpg', '')""" + else: + data_name_without_suffix = os.path.split(data_root)[1].split('.')[0] + if add_class: + class_name = os.path.split(data_root.split('.')[0])[1] + data_name_without_suffix = class_name + '_' + data_name_without_suffix + new_data_root = os.path.join(output_root, data_name_without_suffix) + + img = Image.open(data_root) + img = img.crop(center_crop(img.size)) + resized_img = img.resize((int(size), int(size)), Image.ANTIALIAS) + save_file(resized_img, new_data_root) + + +if __name__ == '__main__': + args = get_args_parser() + args = args.parse_args() + + input_root = args.input_root + output_root = args.output_root + suffix = args.suffix + size = args.size + add_class = args.add_class + data_crop_resize(input_root, output_root, suffix, size, add_class) diff --git a/PuzzleTuning/dataprocessing/CPIA-main/ROI/CPIA_ROI_1_Microscope_DataBiox.py b/PuzzleTuning/dataprocessing/CPIA-main/ROI/CPIA_ROI_1_Microscope_DataBiox.py new file mode 100644 index 0000000000000000000000000000000000000000..1112b6dca32fcec9295aa6ea2b649fc44472f632 --- /dev/null +++ b/PuzzleTuning/dataprocessing/CPIA-main/ROI/CPIA_ROI_1_Microscope_DataBiox.py @@ -0,0 +1,99 @@ +""" 
def get_args_parser():
    """
    Build the command-line parser for the DataBiox microscope cropping script

    Options: ``--input_root``, ``--output_root``, ``--suffix`` and ``--size``.
    The parser is created with ``add_help=False``.

    :return: the configured ``argparse.ArgumentParser``
    """
    parser = argparse.ArgumentParser('CPIA dataset ROI part image cropping and resizing', add_help=False)
    parser.add_argument('--input_root', default='..', type=str,
                        help='The root that contains the orginal images. Please make sure that there is no unwanted '
                             'images with corresponding suffix under the same root')
    parser.add_argument('--output_root', default=None, type=str,
                        help='The root for the resized and cropped output images. If the root is not provided, this '
                             'program will automatically make an output path')
    parser.add_argument('--suffix', default='jpg', type=str,
                        help='The suffix of the input image')
    parser.add_argument('--size', default=384, type=int,
                        help='The size of the output image')
    # Return the parser object itself; the module `argparse` has no
    # parse_args(), which is what the caller invokes on the result.
    return parser
in all_data: + if data_root.endswith('.txt'): + continue + elif data_root.endswith('.DS_Store'): + continue + elif output_root is None: + new_data_root = (data_root + '_Lite').split('.')[0] + else: + data_name_without_suffix = os.path.split(data_root)[1].split('.')[0] + new_data_root = os.path.join(output_root, data_name_without_suffix) + + img = Image.open(data_root) + width, height = img.size # Get size + + # the cropping parameters are tuned for DataBiox + s = min(width, height) + a = int((s * 5) / 7) + left = int((width - a) / 2) + top = int((height - a) / 2) + right = left + a + bottom = top + a + + # Crop the center of the image + img = img.crop([left, top, right, bottom]) + resized_img = img.resize((int(size), int(size)), Image.ANTIALIAS) + save_file(resized_img, new_data_root) + + +if __name__ == '__main__': + args = get_args_parser() + args = args.parse_args() + + input_root = args.input_root + output_root = args.output_root + suffix = args.suffix + size = args.size + data_crop_resize(input_root, output_root, suffix, size) diff --git a/PuzzleTuning/dataprocessing/CPIA-main/ROI/CPIA_ROI_1_Microscope_P_falciparum.py b/PuzzleTuning/dataprocessing/CPIA-main/ROI/CPIA_ROI_1_Microscope_P_falciparum.py new file mode 100644 index 0000000000000000000000000000000000000000..3e9309013c4ea82a4fc5ef5b2b8a13a640905561 --- /dev/null +++ b/PuzzleTuning/dataprocessing/CPIA-main/ROI/CPIA_ROI_1_Microscope_P_falciparum.py @@ -0,0 +1,98 @@ +""" +CPIA_ROI_1_Microscope_P_falciparum.py ver 23.6.8 +This code aims to crop each Microscope image by the largest center square, keeping the black surroundings out of the +final image +Also, all the images will be resized into 384*384 (or other size you want). 
def get_args_parser():
    """
    Build the command-line parser for the P_falciparum microscope cropping script

    Options: ``--input_root``, ``--output_root``, ``--suffix`` and ``--size``.
    The parser is created with ``add_help=False``.

    :return: the configured ``argparse.ArgumentParser``
    """
    parser = argparse.ArgumentParser('CPIA dataset ROI part image cropping and resizing', add_help=False)
    parser.add_argument('--input_root', default='..', type=str,
                        help='The root that contains the orginal images. Please make sure that there is no unwanted '
                             'images with corresponding suffix under the same root')
    parser.add_argument('--output_root', default=None, type=str,
                        help='The root for the resized and cropped output images. If the root is not provided, this '
                             'program will automatically make an output path')
    parser.add_argument('--suffix', default='jpg', type=str,
                        help='The suffix of the input image')
    parser.add_argument('--size', default=384, type=int,
                        help='The size of the output image')
    # The parser must be returned, not the argparse module: the caller goes on
    # to call parse_args() on the returned object.
    return parser
def get_args_parser():
    """
    Build the command-line parser for the P_uninfected microscope cropping script

    Options: ``--input_root``, ``--output_root``, ``--suffix`` and ``--size``.
    The parser is created with ``add_help=False``.

    :return: the configured ``argparse.ArgumentParser``
    """
    parser = argparse.ArgumentParser('CPIA dataset ROI part image cropping and resizing', add_help=False)
    parser.add_argument('--input_root', default='..', type=str,
                        help='The root that contains the orginal images. Please make sure that there is no unwanted '
                             'images with corresponding suffix under the same root')
    parser.add_argument('--output_root', default=None, type=str,
                        help='The root for the resized and cropped output images. If the root is not provided, this '
                             'program will automatically make an output path')
    parser.add_argument('--suffix', default='jpg', type=str,
                        help='The suffix of the input image')
    parser.add_argument('--size', default=384, type=int,
                        help='The size of the output image')
    # Hand back the parser instance; returning the argparse module made the
    # caller's parse_args() call fail.
    return parser
def get_args_parser():
    """
    Build the command-line parser for the P_vivax microscope cropping script

    Options: ``--input_root``, ``--output_root``, ``--suffix`` and ``--size``.
    The parser is created with ``add_help=False``.

    :return: the configured ``argparse.ArgumentParser``
    """
    parser = argparse.ArgumentParser('CPIA dataset ROI part image cropping and resizing', add_help=False)
    parser.add_argument('--input_root', default='..', type=str,
                        help='The root that contains the orginal images. Please make sure that there is no unwanted '
                             'images with corresponding suffix under the same root')
    parser.add_argument('--output_root', default=None, type=str,
                        help='The root for the resized and cropped output images. If the root is not provided, this '
                             'program will automatically make an output path')
    parser.add_argument('--suffix', default='jpg', type=str,
                        help='The suffix of the input image')
    parser.add_argument('--size', default=384, type=int,
                        help='The size of the output image')
    # Return the configured parser (previously the argparse module was
    # returned by mistake, which has no parse_args()).
    return parser
def save_file(f_image, save_dir, suffix='.jpg'):
    """
    Save images with designated suffix

    The image is converted to RGB and written to ``save_dir + suffix``; the
    parent directory of ``save_dir`` is created when it does not exist yet.

    :param f_image: image object providing ``convert(mode)`` and ``save(path)``
    :param save_dir: output path WITHOUT the file extension
    :param suffix: extension (including the dot) appended to ``save_dir``
    :return: None
    """
    f_image = f_image.convert('RGB')
    filepath, _ = os.path.split(save_dir)
    # A bare file name has an empty directory part, and os.makedirs('') raises;
    # exist_ok=True also removes the race between checking and creating.
    if filepath:
        os.makedirs(filepath, exist_ok=True)
    f_image.save(save_dir + suffix)
elif output_root is None: + new_data_root = (data_root + '_Lite').split('.')[0] + # specially made for GS dataset: + """new_data_root = (data_root + '_Lite').replace('.', '_') + new_data_root = new_data_root.replace('_jpg', '')""" + else: + data_name_without_suffix = os.path.split(data_root)[1].split('.')[0] + new_data_root = os.path.join(output_root, data_name_without_suffix) + + + img = Image.open(data_root) + width, height = img.size # Get dimensions + s = min(width, height) + """left = int((width - a) / 2) + top = int((height - a) / 2) + right = left + a + bottom = top + a + this is DataBiox + """ + + """a = int((s * 5) / 7) + da = int(s / 14) + top = int(s / 7) + da + bottom = top + a - 2 * da + + left = int(s / 7) + 4*da + right = left + a - 2 * da + this is for P.vivax""" + + """a = int((s * 5) / 7) + da = int(s / 14) + top = int(s / 7) + 3* da + bottom = top + a - 4 * da + + left = int(s / 7) + 4 * da + right = left + a - 2 * da + this is for P_falciparum""" + + a = int((s * 5) / 7) + da = int(s / 14) + top = int(s / 7) + 2 * da + bottom = top + a - 2 * da + + left = int(s / 7) + 4 * da + right = left + a - 4 * da + + + # Crop the center of the image + img = img.crop([left, top, right, bottom]) + resized_img = img.resize((int(size), int(size)), Image.ANTIALIAS) + save_file(resized_img, new_data_root) + + +if __name__ == '__main__': + data_crop_resize(r'F:\Puzzle Tuning Datasets\P.uninfected(NIH-NLM-ThickBloodSmearsU)\NIH-NLM-ThickBloodSmearsU\Uninfected Patients', + r'D:\CPIA_VersionJournal\CPIA_MJ\S\P_uninfected', + 'tiff', + 384) \ No newline at end of file diff --git a/PuzzleTuning/dataprocessing/CPIA-main/WSI/CPIA_WSI.py b/PuzzleTuning/dataprocessing/CPIA-main/WSI/CPIA_WSI.py new file mode 100644 index 0000000000000000000000000000000000000000..861951f5951a11e392bf2e2920d52b3560236e39 --- /dev/null +++ b/PuzzleTuning/dataprocessing/CPIA-main/WSI/CPIA_WSI.py @@ -0,0 +1,500 @@ +""" +CPIA_WSI.py ver: 23 Nov 2 +This code aims to split each whole slide image 
def run_map_mp(func, argument_list, num_processes='', is_tqdm=True):
    """Run ``func`` over ``argument_list`` in a process pool, with an optional progress bar

    Ref: https://zhuanlan.zhihu.com/p/359369130

    Args:
        func (func): Target function. It is called as ``func(*args)`` for each
            entry of ``argument_list`` and must be picklable for the pool.
        argument_list (list): Argument list. Example format: [(a1, b1), (a2, b2)]
        num_processes (int, optional): The number of processes. When empty,
            defaults to the number of CPUs - 3. The value is always clamped to
            the range [1, len(argument_list)].
        is_tqdm (bool, optional): Whether to display progress bar (using tqdm). Defaults to True.

    Returns:
        list: Results of ``func`` in the order of ``argument_list``. If the
        pool cannot be used or a worker raises, everything is re-run
        sequentially in this process (an error raised by ``func`` itself then
        propagates to the caller).
    """
    argument_list = list(argument_list)
    if not argument_list:
        return []

    if not num_processes:
        num_processes = cpu_count() - 3
    # cpu_count() - 3 is <= 0 on small machines; a pool needs at least one
    # worker and never more workers than tasks.
    num_processes = max(1, min(int(num_processes), len(argument_list)))

    result_list_tqdm = []
    try:
        # The context manager releases the workers even if a batch raises.
        with Pool(processes=num_processes) as pool:
            print('start running multiprocess using {} threads'.format(num_processes))

            if is_tqdm:
                # starmap only returns once a whole call has finished, so the
                # work is submitted in batches of num_processes and the bar is
                # advanced after every batch.
                with tqdm(total=len(argument_list)) as pbar:
                    for start in range(0, len(argument_list), num_processes):
                        batch = argument_list[start:start + num_processes]
                        result_list_tqdm.extend(pool.starmap(func, batch))
                        pbar.update(len(batch))
            else:
                result_list_tqdm.extend(pool.starmap(func, argument_list))

    except Exception as exc:
        # Best-effort fallback kept from the original, but the arguments are
        # now unpacked the same way starmap does (plain map() passed each
        # argument list as one single parameter).
        print('multiprocess run failed ({!r}), falling back to a sequential run'.format(exc))
        result_list_tqdm = [func(*args) for args in argument_list]
    return result_list_tqdm
def to_2tuple(input):
    """
    Normalise a size specification into a 2-tuple

    - tuple / list with two or more items -> ``(input[0], input[1])``
    - tuple / list with one item          -> ``(input[0], input[0])``
    - int                                 -> ``(input, input)``

    :param input: an int, or a tuple / list of sizes
    :return: tuple with exactly two elements
    :raises ValueError: for an empty tuple / list (the previous version only
        printed a message and returned None)
    :raises TypeError: for any other type (the previous version tried to raise
        a plain tuple, which is itself a TypeError with an unrelated message)
    """
    if isinstance(input, (tuple, list)):
        if len(input) == 0:
            raise ValueError('cannot handle empty {}'.format(type(input).__name__))
        if len(input) == 1:
            return (input[0], input[0])
        # Two or more items: only the first two are used.
        return (input[0], input[1])

    # bool is a subclass of int but is not a meaningful size.
    if isinstance(input, int) and not isinstance(input, bool):
        return (input, input)

    raise TypeError('cannot handle {}'.format(type(input)))
STANDARD_MPP / float(MPP) + print(resize_ratio) + if 1.1 > resize_ratio > 0.9: + patch_size_num_0 = patch_size[0][0] + else: + patch_size_num_0 = int(patch_size[0][0] * resize_ratio) + print(patch_size_num_0) + save_root_0 = os.path.join(os.path.join(save_root, str(patch_size[0][0])), class_name + '-' + str(patch_size[0][0])) + make_path(save_root_0) + w, h = slide.level_dimensions[0] + for i in range(1, w // patch_size_num_0 - 1): + for j in range(1, h // patch_size_num_0 - 1): + + patch = slide.read_region((i * patch_size_num_0, j * patch_size_num_0), 0, + (patch_size_num_0, patch_size_num_0)) + patch = patch.convert('RGB') + if not 1.1 > resize_ratio > 0.9: + patch = patch.resize(patch_size[0], Image.ANTIALIAS) + # resize to 3840 * 3840 + img_single = patch.resize((1, 1), Image.ANTIALIAS) + r, g, b = img_single.getpixel((0, 0)) + if r < 220 and g < 220 and b < 220 and r > 100 and b > 30 and r > g + 20: + save_file(patch, os.path.join(save_root_0, + img_name + '-' + str(patch_size[0][0]) + '-' + str(i) + '-' + str(j))) + print(os.path.join(save_root_0, img_name + '-' + str(patch_size[0][0]) + '-' + str(i) + '-' + str(j))) + name_dir_3840[os.path.join(save_root_0, + img_name + '-' + str(patch_size[0][0]) + '-' + str(i) + '-' + str( + j)) + '-' + str(resize_ratio)] = img + if patient_folder is True: + save_root_patient_0 = os.path.join(save_root_0 + '-patient', img_name) + save_file(patch, os.path.join(save_root_patient_0, + img_name + '-' + str(patch_size[0][0]) + '-' + str(i) + '-' + str(j))) + Image_name_XL = img_name + '-' + str(patch_size[0][0]) + '-' + str(i) + '-' + str(j) + cut_to_patch(patch, Image_name_XL, save_root, + patch_size[1], patch_size[2], patch_size[3], + img_name, class_name, + name_dir_0, name_dir_1, name_dir_2, + patient_folder=patient_folder, + L=L, M=M, S=S) + else: + continue + pd.DataFrame.from_dict(name_dir_3840, orient='index', columns=['origin path']).to_csv( + os.path.join(os.path.join(save_root, str(patch_size[0][0])), class_name + 
'-' + str(patch_size[0][0]) + '.csv') + ) + + +def cut_to_patch(patch, + current_img_name, + save_root, + patch_size_0, patch_size_1, patch_size_2, + img_name, class_name, + name_dir_0, name_dir_1, name_dir_2, + patient_folder=True, + L=True, M=True, S=False + ): + current_img_name = current_img_name + numpy_img = convert_to_npy_no_opening(patch) + patch_size_num_0 = patch_size_0[0] + patch_size_num_1 = patch_size_1[0] + patch_size_num_2 = patch_size_2[0] + save_root_0 = os.path.join(os.path.join(save_root, str(patch_size_num_0)), class_name + '-' + str(patch_size_num_0)) + save_root_1 = os.path.join(os.path.join(save_root, str(patch_size_num_1)), class_name + '-' + str(patch_size_num_1)) + save_root_2 = os.path.join(os.path.join(save_root, str(patch_size_num_2)), class_name + '-' + str(patch_size_num_2)) + + save_root_patient_0 = os.path.join(save_root_0 + '-patient', img_name) + save_root_patient_1 = os.path.join(save_root_1 + '-patient', img_name) + save_root_patient_2 = os.path.join(save_root_2 + '-patient', img_name) + + img_split_0 = to_patch(patch_size_0) + img_patches_0 = img_split_0(numpy_img) + + img_split_1 = to_patch(patch_size_1) + img_patches_1 = img_split_1(numpy_img) + i = 0 + j = 0 + if L: + # on most cases we need L-scale, which is 960 * 960 + for patch in img_patches_0: + i = i + 1 + patch = patch.permute(1, 2, 0) + patch = patch.numpy() + if pick_patch(patch): + img_name_0 = make_name(current_img_name, patch_size_num_0, i) + save_dir_0 = os.path.join(save_root_0, img_name_0) + print(save_dir_0) + patch = array2img(patch) + # patch = patch.resize((384, 384), Image.ANTIALIAS) # 归为384*384 + # for our biggest CPIA we dont want to resize + if patient_folder: + save_file(patch, os.path.join(save_root_patient_0, img_name_0)) + name_dir_0[save_dir_0] = '1' + save_file(patch, save_dir_0) + else: + pass + if M: + # on most cases we need M-scale, which is 384 * 384 + # if M is false then S must be false + for patch_1 in img_patches_1: + # convert the image 
into numpy + j = j + 1 + patch_1 = patch_1.permute(1, 2, 0) + patch_1 = patch_1.numpy() + if pick_patch(patch_1): + # save 384*384 image + img_name_1 = make_name(current_img_name, patch_size_num_1, j) + save_dir_1 = os.path.join(save_root_1, img_name_1) + print(save_dir_1) + if S: + k = 0 + img_split_2 = to_patch(patch_size_2) + img_patches_2 = img_split_2(patch_1) + for patch_2 in img_patches_2: + k = k + 1 + patch_2 = patch_2.permute(1, 2, 0) + patch_2 = patch_2.numpy() + if pick_patch(patch_2): + if k % 10 == 0: + # down sampling + img_name_2 = make_name(img_name_1, patch_size_num_2, k) + patch_2 = array2img(patch_2) + save_dir_2 = os.path.join(save_root_2, img_name_2) + print(save_dir_2) + if patient_folder: + save_file(patch_2, os.path.join(save_root_patient_2, img_name_2)) + name_dir_2[save_dir_2] = '1' + save_file(patch_2, save_dir_2) + else: + pass + + patch_1 = array2img(patch_1) + if patient_folder: + save_file(patch_1, os.path.join(save_root_patient_1, img_name_1)) + name_dir_1[save_dir_1] = '1' + save_file(patch_1, save_dir_1) + else: + pass + pd.DataFrame.from_dict(name_dir_0, orient='index', columns=['origin path']).to_csv( + os.path.join(os.path.join(save_root, + str(patch_size_num_0)), class_name + '-' + str(patch_size_num_0) + '.csv') + ) + pd.DataFrame.from_dict(name_dir_1, orient='index', columns=['origin path']).to_csv( + os.path.join(os.path.join(save_root, + str(patch_size_num_1)), class_name + '-' + str(patch_size_num_1) + '.csv') + ) + pd.DataFrame.from_dict(name_dir_2, orient='index', columns=['origin path']).to_csv( + os.path.join(os.path.join(save_root, + str(patch_size_num_2)), class_name + '-' + str(patch_size_num_2) + '.csv') + ) + + +def read_and_convert(data_root, save_root, suffix, patient_folder, mpp, resume, resume_dataset, resume_WSI, L, M, S): + + class_names = os.listdir(data_root) + print(class_names) + + for class_name in class_names: + svs_class_root = os.path.join(data_root, class_name) + svs_all_files = 
def get_args_parser():
    """Build the command-line parser of the CPIA WSI cropping script.

    The parser is created with ``add_help=False`` because the ``__main__`` section
    passes it as a parent to a second ``ArgumentParser``.

    :return: an ``argparse.ArgumentParser`` exposing --input, --output, --suffix,
             --patient, --mpp, --resume, --resume_dataset and --resume_WSI
    """

    def str2bool(value):
        # ``type=bool`` applies bool() to the raw string, so every non-empty value
        # (including the literal 'False') used to be parsed as True.
        lowered = str(value).strip().lower()
        if lowered in ('true', 't', 'yes', 'y', '1'):
            return True
        if lowered in ('false', 'f', 'no', 'n', '0', ''):
            return False
        raise argparse.ArgumentTypeError('expected a boolean value, got %r' % (value,))

    parser = argparse.ArgumentParser('CPIA_WSI', add_help=False)

    parser.add_argument('--input', default='/data/WSI_1', type=str,
                        help='path to input dataset')
    parser.add_argument('--output', default='/data-save', type=str,
                        help='path to output patches')
    parser.add_argument('--suffix', default='svs', type=str,
                        help='input image suffix')
    parser.add_argument('--patient', default=False, type=str2bool,
                        help='whether to generate patient folder')
    parser.add_argument('--mpp', default=None, type=str,
                        help='for some datasets the MPP need to be provided manually')
    parser.add_argument('--resume', default=False, type=str2bool,
                        help='resume the process')
    parser.add_argument('--resume_dataset', default='CPTAC-BRCA', type=str,
                        help='name of the dataset to resume from')
    parser.add_argument('--resume_WSI',
                        default='/data/WSI_1/CPTAC-BRCA/BRCA/20BR008-76d73924-1d5a-4d75-b776-1f1b2b.svs',
                        type=str,
                        help='the actual path of the WSI to be resumed')

    return parser
def sampling(file_dir, target_dir, rate, split_subset_range='ALL', CLS=False):
    """Copy a random fraction of a dataset into a new dataset folder.

    The source is read as ``file_dir/<split>/data/<class>/<file>`` and, unless ``CLS``
    is true, ``file_dir/<split>/mask/<class>/<file>`` with the same file names.

    :param file_dir: input dataset path
    :param target_dir: output dataset path
    :param rate: percentage (0-100) of the files of every class folder to keep
    :param split_subset_range: 'train' samples only the split named 'train' and copies every
        other split unchanged; 'ALL' samples every split
    :param CLS: when true, mask files are neither created nor copied
    :raises ValueError: if ``split_subset_range`` is neither 'ALL' nor 'train'
    """
    # Validate up front: the former bare ``raise`` had no active exception and surfaced
    # as an unrelated "No active exception to reraise" RuntimeError.
    if split_subset_range not in ('ALL', 'train'):
        raise ValueError('not a valid split_subset_range: %r (expected ALL or train)'
                         % (split_subset_range,))

    def sample_split(split_name):
        # Sample ``rate`` percent of every class folder of one split.
        src_data = os.path.join(file_dir, split_name, 'data')
        src_mask = os.path.join(file_dir, split_name, 'mask')
        dst_data = os.path.join(target_dir, split_name, 'data')
        dst_mask = os.path.join(target_dir, split_name, 'mask')

        for class_name in os.listdir(src_data):
            os.makedirs(os.path.join(dst_data, class_name), exist_ok=True)
            if not CLS:
                os.makedirs(os.path.join(dst_mask, class_name), exist_ok=True)

            file_names = os.listdir(os.path.join(src_data, class_name))
            pick_number = int(len(file_names) * rate / 100)
            for name in random.sample(file_names, pick_number):
                shutil.copyfile(os.path.join(src_data, class_name, name),
                                os.path.join(dst_data, class_name, name))
                if not CLS:
                    # the mask is looked up under the same class folder and file name
                    shutil.copyfile(os.path.join(src_mask, class_name, name),
                                    os.path.join(dst_mask, class_name, name))

    print('Dataset at', file_dir)
    for split_name in os.listdir(file_dir):
        if split_subset_range == 'ALL' or split_name == 'train':
            sample_split(split_name)
        else:
            # 'train' mode: every other split is carried over untouched
            shutil.copytree(os.path.join(file_dir, split_name), os.path.join(target_dir, split_name))

        print(split_name, 'has been processed')

    return
def get_args_parser():
    """Build the command-line parser of the fraction-sampling script.

    :return: an ``argparse.ArgumentParser`` exposing --root, --save_root, --rate,
             --split_subset_range and --CLS
    """

    def str2bool(value):
        # ``type=bool`` applies bool() to the raw string, so '--CLS False' used to be True.
        lowered = str(value).strip().lower()
        if lowered in ('true', 't', 'yes', 'y', '1'):
            return True
        if lowered in ('false', 'f', 'no', 'n', '0', ''):
            return False
        raise argparse.ArgumentTypeError('expected a boolean value, got %r' % (value,))

    parser = argparse.ArgumentParser(description='data_sampling')
    parser.add_argument('--root', default='/root/autodl-tmp/datasets/ROSE_MIL', type=str,
                        help='the data root, not including the final list')
    parser.add_argument('--save_root', default='/root/autodl-tmp/datasets', type=str,
                        help='the folder in which the sampled dataset is created')
    parser.add_argument('--rate', default=10, type=int,
                        help='the rate of sampling')
    parser.add_argument('--split_subset_range', default='train', type=str,
                        help='the subset which will be sampled: ALL or train')
    parser.add_argument('--CLS', default=False, type=str2bool,
                        help='the type of dataset: CLS or MIL')

    return parser
def save_file(f_image, save_dir, suffix='.jpg'):
    """Write an image to ``save_dir + suffix``, creating the parent folder when needed.

    :param f_image: image to store; it is converted with ``np.asarray``. The callers in this
        file pass PIL images converted to 'RGB' -- NOTE(review): confirm no caller passes an
        array that is already in BGR order.
    :param save_dir: destination path WITHOUT the file extension
    :param suffix: file extension appended to ``save_dir``
    """
    filepath, _ = os.path.split(save_dir)
    if filepath:
        # exist_ok avoids the exists()/makedirs() race; a bare file name has no folder to create
        os.makedirs(filepath, exist_ok=True)

    image_data = np.asarray(f_image)
    # cv2.imwrite interprets colour data as BGR(A); the RGB(A) array coming from PIL must be
    # reordered, otherwise red and blue are swapped in the stored file.
    if image_data.ndim == 3 and image_data.shape[2] == 3:
        image_data = cv2.cvtColor(image_data, cv2.COLOR_RGB2BGR)
    elif image_data.ndim == 3 and image_data.shape[2] == 4:
        image_data = cv2.cvtColor(image_data, cv2.COLOR_RGBA2BGRA)

    # imwrite reports failure through its return value instead of raising
    if not cv2.imwrite(save_dir + suffix, image_data):
        print('failed to write image:', save_dir + suffix)
def to_2tuple(input):
    """Normalise a size specification to a 2-element tuple.

    :param input: an int ``n`` (-> ``(n, n)``), or a tuple/list with one element
        (-> duplicated), two elements, or more (-> the first two are kept)
    :return: a tuple of two items; a 2-tuple argument is returned unchanged
    :raises ValueError: if ``input`` is an empty tuple or list
    :raises TypeError: if ``input`` is neither an int, a tuple nor a list
    """
    if isinstance(input, (tuple, list)):
        if len(input) == 0:
            # formerly only printed a message and returned None, which failed later at the caller
            raise ValueError('cannot handle empty ' + type(input).__name__)
        if isinstance(input, tuple) and len(input) == 2:
            return input
        if len(input) == 1:
            return (input[0], input[0])
        return (input[0], input[1])

    # bool is a subclass of int but is not a meaningful size
    if isinstance(input, int) and not isinstance(input, bool):
        return (input, input)

    # ``raise (msg, type)`` raised a tuple, which Python rejects with an unrelated TypeError
    raise TypeError('cannot handle ' + str(type(input)))
print(patch_size_num_0) + save_root_0 = os.path.join(os.path.join(save_root, str(patch_size[0][0])), class_name + '-' + str(patch_size[0][0])) + make_and_clear_path(save_root_0) + w, h = slide.level_dimensions[0] + for i in range(1, w // patch_size_num_0 - 1): + + for j in range(1, h // patch_size_num_0 - 1): + + patch = slide.read_region((i * patch_size_num_0, j * patch_size_num_0), 0, (patch_size_num_0, patch_size_num_0)) + patch = patch.convert('RGB') + # print('finish id:%d image' % image_list.index(id)) + if not 1.1 > resize_ratio > 0.9: + patch = patch.resize(patch_size[0], Image.ANTIALIAS) # resize 到 3840 3840 + # 统一归为384*384 + # save_file(patch, os.path.join(save_root_0, img_name + '-' + str((i + 1) * (j + 1)))) + img_single = patch.resize((1, 1), Image.ANTIALIAS) + r, g, b = img_single.getpixel((0, 0)) + if r < 220 and g < 220 and b < 220 and r > 100 and b > 30 and r > g + 20: + + save_file(patch, os.path.join(save_root_0, img_name + '-' + str(patch_size[0][0]) + '-' + str(i) + '-' + str(j))) + name_dir_3840[os.path.join(save_root_0, img_name + '-' + str(patch_size[0][0]) + '-' + str(i) + '-' + str(j)) + '-' + str(resize_ratio)] = img + if patient_folder is True: + save_root_patient_0 = os.path.join(save_root_0 + '-patient', img_name) + save_file(patch, os.path.join(save_root_patient_0, img_name + '-' + str(patch_size[0][0]) + '-' + str(i) + '-' + str(j))) + current_img = os.path.join(save_root_0, img_name + '-' + str(patch_size[0][0]) + '-' + str(i) + '-' + str(j)) + '.jpg' + + cut_to_patch(current_img, save_root, + patch_size[1], patch_size[2], patch_size[3], + img_name, class_name, + name_dir_0, name_dir_1, name_dir_2, + patient_folder=patient_folder, + L=L, M=M, S=S) + else: + continue + # save_file(patch, os.path.join('H:\PuzzleTuning\SNL-Breast-Back', img_name + '-' + str(i) + '-' +c + # str(j))) + pd.DataFrame.from_dict(name_dir_3840, orient='index', columns=['origin path']).to_csv( + os.path.join(os.path.join(save_root, str(patch_size[0][0])), 
class_name + '-' + str(patch_size[0][0]) + '.csv') + ) + + except Exception as e: + print(e) + + +def cut_to_patch(img, + save_root, + patch_size_0, patch_size_1, patch_size_2, + img_name, class_name, + name_dir_0, name_dir_1, name_dir_2, + patient_folder=True, + L=True, M=True, S=False + ): + current_img_name = os.path.split(img)[1].split('.')[0] + numpy_img = convert_to_npy(img) + patch_size_num_0 = patch_size_0[0] + patch_size_num_1 = patch_size_1[0] + patch_size_num_2 = patch_size_2[0] + save_root_0 = os.path.join(os.path.join(save_root, str(patch_size_num_0)), class_name + '-' + str(patch_size_num_0)) + save_root_1 = os.path.join(os.path.join(save_root, str(patch_size_num_1)), class_name + '-' + str(patch_size_num_1)) + save_root_2 = os.path.join(os.path.join(save_root, str(patch_size_num_2)), class_name + '-' + str(patch_size_num_2)) + + save_root_patient_0 = os.path.join(save_root_0 + '-patient', img_name) + save_root_patient_1 = os.path.join(save_root_1 + '-patient', img_name) + save_root_patient_2 = os.path.join(save_root_2 + '-patient', img_name) + + img_split_0 = to_patch(patch_size_0) + img_patches_0 = img_split_0(numpy_img) + + img_split_1 = to_patch(patch_size_1) + img_patches_1 = img_split_1(numpy_img) + i = 0 + j = 0 + if L: + # on most cases we need L-scale, which is 960 * 960 + for patch in img_patches_0: + i = i + 1 + patch = patch.permute(1, 2, 0) + patch = patch.numpy() + if pick_patch(patch): + img_name_0 = make_name(current_img_name, patch_size_num_0, i) + save_dir_0 = os.path.join(save_root_0, img_name_0) + print(save_dir_0) + patch = array2img(patch) + # patch = patch.resize((384, 384), Image.ANTIALIAS) # 归为384*384 + # for our biggest CPIA we dont want to resize + if patient_folder: + save_file(patch, os.path.join(save_root_patient_0, img_name_0)) + name_dir_0[save_dir_0] = img + # 保存相关.csv + save_file(patch, save_dir_0) + else: + pass + if M: + # on most cases we need M-scale, which is 384 * 384 + # if M is false then S must be false + for 
def read_and_convert(data_root, save_root, suffix=None, patient_folder=False, L=True, M=True, S=False,
                     class_names=('PAIP2019',)):
    """Crop every slide found under the selected dataset folders of ``data_root``.

    One call handles the datasets named in ``class_names``; each is a sub-folder of
    ``data_root`` that is searched recursively with ``find_all_files``.

    :param data_root: folder holding one sub-folder per dataset
    :param save_root: output root, forwarded to ``SVS_cut_to_patch``
    :param suffix: file suffix(es) forwarded to ``find_all_files``
    :param patient_folder: forwarded to ``SVS_cut_to_patch``
    :param L: forwarded to ``SVS_cut_to_patch``
    :param M: forwarded to ``SVS_cut_to_patch``
    :param S: forwarded to ``SVS_cut_to_patch``
    :param class_names: dataset folder names to process. The default keeps the value that was
        previously hard-coded in the body; pass ``None`` to process every entry of ``data_root``.
    """
    # Listing the root first keeps the original early failure when data_root does not exist.
    available_names = os.listdir(data_root)
    class_names = available_names if class_names is None else list(class_names)

    # To resume an interrupted run, pass the remaining datasets through ``class_names``.
    for class_name in class_names:
        svs_class_root = os.path.join(data_root, class_name)
        svs_all_files = find_all_files(svs_class_root, suffix)

        # one fresh set of "saved path -> origin" records per dataset
        name_dir_3840 = {}
        name_dir_0 = {}
        name_dir_1 = {}
        name_dir_2 = {}
        for img in svs_all_files:
            # slide name = file name up to the first dot
            img_name = os.path.split(img)[1].split('.')[0]
            SVS_cut_to_patch(img, save_root, patch_size, img_name, class_name,
                             name_dir_3840, name_dir_0, name_dir_1, name_dir_2,
                             patient_folder, L=L, M=M, S=S)
def find_all_files(root, suffix=None):
    """Return a list of file paths ended with specific suffix.

    :param root: folder that is walked recursively with ``os.walk``
    :param suffix: ``None`` (keep every file), a single suffix string, or a tuple/list of
        suffix strings (a file is kept when it ends with any of them)
    :return: list of paths joined from the walked folder and the file name; ``-1`` (after
        printing a message) when ``suffix`` has an unsupported type
    """
    if isinstance(suffix, (tuple, list)):
        # str.endswith accepts a tuple of candidates, replacing the manual inner loop
        wanted = tuple(suffix)
    elif suffix is None or isinstance(suffix, str):
        wanted = suffix
    else:
        print('type of suffix is not legal :', type(suffix))
        return -1

    res = []
    # a distinct loop name keeps the ``root`` argument from being overwritten during the walk
    for dir_path, _, files in os.walk(root):
        for f in files:
            if wanted is None or f.endswith(wanted):
                res.append(os.path.join(dir_path, f))
    return res
class to_patch:
    """
    Callable that cuts one image into a batch of equally sized, non-overlapping patches
    """

    def __init__(self, patch_size=(16, 16)):
        # normalise ints / lists / longer tuples to a (height, width) pair
        size_hw = to_2tuple(patch_size)
        self.patch_h = size_hw[0]
        self.patch_w = size_hw[1]

    def __call__(self, x):
        """
        :param x: channel-last image data (height, width, channel) accepted by ``torch.tensor``
        :return: tensor of shape (num_patches, channel, patch_h, patch_w), ordered row by row
        """
        ph = self.patch_h
        pw = self.patch_w

        # channel-last -> channel-first
        chw = torch.tensor(x).permute(2, 0, 1)
        channels, height, width = chw.shape

        rows = height // ph
        cols = width // pw
        total = rows * cols

        # keep the centred region that is an exact multiple of the patch size
        used_h = rows * ph
        used_w = cols * pw
        top = (height - used_h) // 2
        left = (width - used_w) // 2
        centred = chw[:, top:(top + used_h), left:(left + used_w)]

        # (c, rows, ph, cols, pw) -> (rows, cols, ph, pw, c) -> one flat vector per patch
        grid = centred.view(channels, rows, ph, cols, pw).permute(1, 3, 2, 4, 0)
        flat = grid.reshape(total, -1)

        # flat vector -> (ph, pw, c) -> channel-first patch
        return flat.view(total, ph, pw, channels).permute(0, 3, 1, 2)
def pick_patch(patch):
    """
    Decide from its average colour whether a patch is worth keeping.

    The patch is converted with ``array2img`` and shrunk to a single pixel; it is kept when
    the red value of that pixel exceeds the green value by at least 30.

    :param patch: image array accepted by ``array2img``
    :return: True to keep the patch, False to discard it
    """
    patch = array2img(patch)
    # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the same filter under the
    # name that exists in both old and new Pillow releases.
    img_single = patch.resize((1, 1), Image.LANCZOS)
    r, g, b = img_single.getpixel((0, 0))
    return r - g >= 30
os.path.split(img)[1].split('.')[0] + slide = openslide.open_slide(img) + try: + MPP = slide.properties[openslide.PROPERTY_NAME_MPP_X] + resize_ratio = STANDARD_MPP/float(MPP) + + if 1.1 > resize_ratio > 0.9: + patch_size_num_0 = patch_size[0][0] + else: + patch_size_num_0 = int(patch_size[0][0] * resize_ratio) + + save_root_0 = os.path.join(os.path.join(save_root, str(patch_size[0][0])), class_name + '-' + str(patch_size[0][0])) + make_and_clear_path(save_root_0) + w, h = slide.level_dimensions[0] + for i in range(1, w // patch_size_num_0 - 1): + + for j in range(1, h // patch_size_num_0 - 1): + + patch = slide.read_region((i * patch_size_num_0, j * patch_size_num_0), 0, (patch_size_num_0, patch_size_num_0)) + patch = patch.convert('RGB') + # print('finish id:%d image' % image_list.index(id)) + if not 1.1 > resize_ratio > 0.9: + patch = patch.resize(patch_size[0], Image.ANTIALIAS) # resize 到 3840 3840 + # 统一归为384*384 + # save_file(patch, os.path.join(save_root_0, img_name + '-' + str((i + 1) * (j + 1)))) + img_single = patch.resize((1, 1), Image.ANTIALIAS) + r, g, b = img_single.getpixel((0, 0)) + if r < 220 and g < 220 and b < 220 and r > 100 and b > 30 and r > g + 20: + num_XL += 1 + # save_file(patch, os.path.join(save_root_0, img_name + '-' + str(patch_size[0][0]) + '-' + str(i) + '-' + str(j))) + current_img_name = img_name + '-' + str(patch_size[0][0]) + '-' + str(i) + '-' + str(j) + + cut_to_patch(patch, current_img_name, save_root, + patch_size[1], patch_size[2], patch_size[3], + img_name, class_name, + patient_folder=patient_folder, + L=L, M=M, S=S) + else: + continue + + except Exception as e: + print(e) + + +def cut_to_patch(patch, + current_img_name, + save_root, + patch_size_0, patch_size_1, patch_size_2, + img_name, class_name, + patient_folder=True, + L=True, M=True, S=True + ): + global num_L, num_M, num_S + current_img_name = current_img_name + numpy_img = convert_to_npy_no_opening(patch) + patch_size_num_0 = patch_size_0[0] + patch_size_num_1 = 
patch_size_1[0] + patch_size_num_2 = patch_size_2[0] + + img_split_0 = to_patch(patch_size_0) + img_patches_0 = img_split_0(numpy_img) + + img_split_1 = to_patch(patch_size_1) + img_patches_1 = img_split_1(numpy_img) + i = 0 + j = 0 + if L: + # on most cases we need L-scale, which is 960 * 960 + for patch in img_patches_0: + i = i + 1 + patch = patch.permute(1, 2, 0) + patch = patch.numpy() + if pick_patch(patch): + img_name_0 = make_name(current_img_name, patch_size_num_0, i) + num_L += 1 + else: + pass + if M: + # on most cases we need M-scale, which is 384 * 384 + # if M is false then S must be false + for patch_1 in img_patches_1: + # convert the image into numpy + j = j + 1 + patch_1 = patch_1.permute(1, 2, 0) + patch_1 = patch_1.numpy() + if pick_patch(patch_1): + # save 384*384 image + num_M += 1 + if S: + k = 0 + img_split_2 = to_patch(patch_size_2) + img_patches_2 = img_split_2(patch_1) + for patch_2 in img_patches_2: + k = k + 1 + patch_2 = patch_2.permute(1, 2, 0) + patch_2 = patch_2.numpy() + if pick_patch(patch_2): + if k % 10 == 0: + num_S += 1 + + else: + pass + else: + pass + + +def read_and_convert(data_root, save_root, suffix=None, L=True, M=True, S=True): + global num_XL, num_L, num_M, num_S + dataset_list = [] + # class_names = os.listdir(data_root) + class_names = ['tif'] + # 接下来一行代码只在断点续传使用 + # class_names = class_names[class_names.index('CPTAC-UCEC') :] + + + + for class_name in class_names: + + svs_class_root = os.path.join(data_root, class_name) + svs_all_files = find_all_files(svs_class_root, suffix) + + num_XL = 0 + num_L = 0 + num_M = 0 + num_S = 0 + for seq in tqdm(range(len(svs_all_files))): + img = svs_all_files[seq] + SVS_cut_to_patch(img, save_root, patch_size, class_name, + patient_folder=True, L=L, M=M, S=S) + print({'dataset_name': str(class_name), + 'num_XL': int(num_XL), + 'num_L': int(num_L), + 'num_M': int(num_M), + 'num_S': int(num_S)}) + + dataset_list.append( + {'dataset_name': str(class_name), + 'num_XL': int(num_XL), + 
def data_loop(device, train_loader, check_minibatch=100):
    """
    Walk through every minibatch of a loader to check that all samples can be loaded.

    Each batch is moved to ``device``; nothing else is computed. After every
    ``check_minibatch`` batches the elapsed time of that window is printed, and
    the total elapsed time is printed once the loader is exhausted.

    :param device: target passed to ``data.to(...)`` for every batch
    :param train_loader: iterable yielding ``(data, label)`` pairs
    :param check_minibatch: number of batches between two progress reports
    :return: None
    """
    started_at = time.time()
    last_report_at = started_at

    for batch_no, (data, label) in enumerate(train_loader):
        # Moving the batch is the actual "check": a broken sample fails while loading.
        data = data.to(device)

        window_no, position_in_window = divmod(batch_no, check_minibatch)
        if position_in_window != check_minibatch - 1:
            continue

        # Last batch of the current window: report how long the window took.
        check_index = window_no + 1
        current = time.time()
        gap_time = current - last_report_at
        last_report_at = current
        print('index of ' + str(check_minibatch) + ' minibatch:', check_index, ' time used:', gap_time)

    print('all checked, time used:', time.time() - started_at)
def a_dataset_split(src_data_folder, target_data_folder, class_name, train_scale, val_scale, test_scale, com_num=None):
    """
    Randomly split the files of one class into train / val / test folders.

    The files in ``src_data_folder/class_name`` are shuffled and copied to
    ``target_data_folder/{train,val,test}/class_name``. The first
    ``round(n * train_scale)`` shuffled files go to train, the files up to
    ``round(n * (train_scale + val_scale))`` go to val, and everything left
    over goes to test.

    :param src_data_folder: root folder holding one sub-folder per class
    :param target_data_folder: root folder receiving the 'train', 'val' and 'test' folders
    :param class_name: name of the class sub-folder to split
    :param train_scale: train set ratio
    :param val_scale: validation set ratio
    :param test_scale: test set ratio (only reported; test receives the remainder)
    :param com_num: optional running index of the class, printed when given
    :return: None
    """
    current_class_data_path = os.path.join(src_data_folder, class_name)
    shuffled_names = list(os.listdir(current_class_data_path))
    random.shuffle(shuffled_names)
    current_data_length = len(shuffled_names)

    train_folder = os.path.join(os.path.join(target_data_folder, 'train'), class_name)
    val_folder = os.path.join(os.path.join(target_data_folder, 'val'), class_name)
    test_folder = os.path.join(os.path.join(target_data_folder, 'test'), class_name)

    # Integer cut points. The previous `idx <= n * scale` test on a 0-based index
    # always gave the train split one image too many (8:2 of 10 became 9:1);
    # rounding also absorbs float noise such as 10 * 0.7 == 7.000000000000001.
    train_end = min(current_data_length, max(0, round(current_data_length * train_scale)))
    val_end = min(current_data_length,
                  max(train_end, round(current_data_length * (train_scale + val_scale))))

    splits = (
        (train_folder, shuffled_names[:train_end]),
        (val_folder, shuffled_names[train_end:val_end]),
        (test_folder, shuffled_names[val_end:]),
    )
    for folder, names in splits:
        # copy2 onto a missing directory would create a *file* with the class name,
        # so make sure the destination exists even when called on its own.
        os.makedirs(folder, exist_ok=True)
        for name in names:
            copy2(os.path.join(current_class_data_path, name), folder)

    train_num = train_end
    val_num = val_end - train_end
    test_num = current_data_length - val_end

    print("*********************************{}*************************************".format(class_name) + '\n' +
          "{} class has been divided into {}:{}:{}, a total of {} images".format(class_name, train_scale, val_scale,
                                                                                 test_scale,
                                                                                 current_data_length)
          + '\n' + "Train set{}: {} pics".format(train_folder, train_num)
          + '\n' + "Validation set{}: {} pics".format(val_folder, val_num)
          + '\n' + "Test set{}: {} pics".format(test_folder, test_num)
          + '\n')

    if com_num is not None:
        print('processed class idx:', com_num)
def k_fold_split(src_data_folder, target_data_folder='./kfold', k=5):
    """
    Read the source data folder, generate divided folders as 'train', 'val'.

    For every class the file names are shuffled and dealt round-robin into k
    packs, so pack sizes differ by at most one and every file lands in exactly
    one pack. Fold p (1-based) uses pack p as its validation set and the other
    k-1 packs as its training set.

    :param src_data_folder: organized imagenet format folders that need to be divided by k-folding
    :param target_data_folder: large target folder with k folders generated inside, k folders are in imagenet format with train and val inside
    :param k: the number of divided folds

    :return: None
    :raises ValueError: if k is smaller than 1
    """
    if k < 1:
        raise ValueError('k must be a positive integer, got {}'.format(k))

    make_and_clear_path(target_data_folder)
    print("Begin dataset division")
    class_names = os.listdir(src_data_folder)  # Get category name

    # Divide the dataset for each category and copy the data images fold by fold
    for class_name in class_names:
        current_class_data_path = os.path.join(src_data_folder, class_name)
        current_class_data_names = os.listdir(current_class_data_path)

        current_data_length = len(current_class_data_names)
        random.shuffle(current_class_data_names)

        # Round-robin dealing. The previous fixed-size chunking crashed when a class
        # had fewer than k images (chunk size 0) and silently dropped images whenever
        # the remainder was spread over more than one trailing chunk (n=9, k=5 lost 3).
        fold_name_pack = [current_class_data_names[i::k] for i in range(k)]

        print("{} class is divided into {} cross-validation, a total of {} images".format(class_name, k,
                                                                                            current_data_length))

        for p in range(1, k + 1):  # For each fold, start from 1
            fold_root = os.path.join(target_data_folder, 'fold_' + str(p))
            train_folder = os.path.join(fold_root, 'train', class_name)
            val_folder = os.path.join(fold_root, 'val', class_name)
            os.makedirs(train_folder)
            os.makedirs(val_folder)

            train_num = 0
            val_num = 0

            for pack_idx, pack in enumerate(fold_name_pack):
                # Pack p-1 is the validation set of fold p, the rest is training data
                is_val_pack = pack_idx == p - 1
                destination = val_folder if is_val_pack else train_folder
                for name in pack:
                    copy2(os.path.join(current_class_data_path, name), destination)
                if is_val_pack:
                    val_num += len(pack)
                else:
                    train_num += len(pack)

            print("fold {}: class:{} train num: {}".format(p, class_name, train_num))
            print("fold {}: class:{} val num: {}".format(p, class_name, val_num))
def save_file(f_image, save_dir, suffix='.jpg'):
    """
    Save an image to ``save_dir + suffix``, creating the parent folder when needed.

    :param f_image: object exposing ``save(path)`` (the callers pass a PIL image)
    :param save_dir: target path WITHOUT the file extension
    :param suffix: file extension appended to ``save_dir``
    :return: None
    """
    parent_dir = os.path.dirname(save_dir)
    # A bare file name has no parent component; os.makedirs('') would raise
    # FileNotFoundError, so only create the folder when there is one.
    if parent_dir:
        # exist_ok avoids the check-then-create race of exists() + makedirs()
        os.makedirs(parent_dir, exist_ok=True)
    f_image.save(save_dir + suffix)
def trans_csv_folder_to_imagefoder(target_path=r'C:\Users\admin\Desktop\MRAS_SEED_dataset',
                                   original_path=r'C:\Users\admin\Desktop\dataset\MARS_SEED_Dataset\train\train_org_image',
                                   csv_path=r'C:\Users\admin\Desktop\dataset\MARS_SEED_Dataset\train\train_label.csv'):
    """
    Original data format: a folder with image inside + a csv file with header which has the name and category of every image.
    Process original dataset and get data packet in image folder format

    The first csv row is treated as the header and skipped; blank rows are ignored.
    Every remaining row is read as ``(image name, category, ...)`` and the image is
    copied from ``original_path`` into ``target_path/<category>``.
    ``target_path`` is created if missing and emptied before copying.

    :param target_path: the path of target image folder
    :param original_path: The folder with images
    :param csv_path: A csv file with header and the name and category of each image
    :return: None
    """
    # newline='' is what the csv module expects so that quoted fields containing
    # line breaks are parsed correctly.
    with open(csv_path, "rt", encoding="utf-8", newline='') as csvfile:
        rows = list(csv.reader(csvfile))

    make_and_clear_path(target_path)  # Clear target_path

    # Skip the header and drop blank lines (csv.reader yields [] for them, which
    # would otherwise fail on row[0]).
    data_rows = [row for row in rows[1:] if row]

    for row in tqdm(data_rows):
        class_folder = os.path.join(target_path, row[1])
        os.makedirs(class_folder, exist_ok=True)
        shutil.copy(os.path.join(original_path, row[0]), class_folder)

    # Number of copied images; an empty csv now reports 0 instead of -1
    print('total num:', len(data_rows))
original_path=r'C:\Users\admin\Desktop\dataset\MARS_SEED_Dataset\train\train_org_image', + csv_path=r'C:\Users\admin\Desktop\dataset\MARS_SEED_Dataset\train\train_label.csv'): + """ + Original data format: a folder with image inside + a csv file with header which has the name and category of every image. + Process original dataset and get data packet in image folder format + + :param target_path: the path of target image folder + :param original_path: The folder with images + :param csv_path: A csv file with header and the name and category of each image + """ + idx = -1 + with open(csv_path, "rt", encoding="utf-8") as csvfile: + reader = csv.reader(csvfile) + rows = [row for row in reader] + + if not os.path.exists(target_path): + os.makedirs(target_path) + + for row in tqdm(rows): + idx += 1 + + item_path = row[0] + if os.path.exists(os.path.join(target_path, row[1])): + shutil.copy(item_path, os.path.join(target_path, row[1])) + else: + os.makedirs(os.path.join(target_path, row[1])) + shutil.copy(item_path, os.path.join(target_path, row[1])) + + print('total num:', idx) + + +class PILImageTransform: + def __init__(self): + pass + + def __call__(self, image): + # Trans cv2 BGR image to PIL RGB image + b, g, r = cv2.split(image) + image = cv2.merge([r, g, b]) + return Image.fromarray(np.uint8(image)) + + +class Front_Background_Dataset(torch.utils.data.Dataset): + def __init__(self, input_root, data_transforms=None, edge_size=384, suffix='.jpg'): + + super().__init__() + + self.data_root = input_root + + # get files + self.input_ids = sorted(find_all_files(self.data_root, suffix=suffix)) + + # to PIL + self.PIL_Transform = PILImageTransform() + + # get data augmentation and transform + if data_transforms is not None: + self.transform = data_transforms + else: + self.transform = transforms.Compose([transforms.Resize(edge_size), transforms.ToTensor()]) + + def __len__(self): + return len(self.input_ids) + + def __getitem__(self, idx): + # get data path + imageName = 
def inferance(model, dataloader, record_dir, class_names=('0', '1'), result_csv_name='inferance.csv', device='cuda'):
    """
    Run the model over a dataloader and record the predicted class of every sample.

    One line ``<image id>, <class name>, `` is written per sample to
    ``record_dir/result_csv_name``. The model is switched to eval mode and the
    forward passes run without gradient tracking.

    :param model: callable module returning per-class scores of shape [batch, num_classes]
    :param dataloader: iterable yielding ``(images, imageIDs)`` batches
    :param record_dir: folder receiving the result csv (created when missing)
    :param class_names: sequence mapping a predicted class index to its name
    :param result_csv_name: file name of the result csv
    :param device: device the image batches are moved to before the forward pass
    :return: None
    """
    os.makedirs(record_dir, exist_ok=True)

    model.eval()
    print('Inferance')
    print('-' * 10)

    check_idx = 0

    # no_grad: pure inference, so building the autograd graph only wastes memory.
    # The `with` statement closes the log file; no explicit close() is needed.
    with open(os.path.join(record_dir, result_csv_name), 'w') as f_log, torch.no_grad():
        # Iterate over data.
        for images, imageIDs in dataloader:
            images = images.to(device)

            # forward
            outputs = model(images)
            _, preds = torch.max(outputs, 1)

            for image_id, pred_label in zip(imageIDs, preds.cpu().tolist()):
                f_log.write(str(image_id) + ', ' + str(class_names[pred_label]) + ', \n')
                check_idx += 1

    print(str(check_idx) + ' samples are all recorded')
edge_size=edge_size) + + inf_dataset = Front_Background_Dataset(dataroot, data_transforms=data_transforms['val'], edge_size=edge_size, + suffix='.jpg') + dataloader = torch.utils.data.DataLoader(inf_dataset, batch_size=batch_size, num_workers=2, shuffle=False) + + class_names = ['0', '1'] # 0 for empty + + # Get model + pretrained_backbone = False + if args.num_classes == 0: + print("class_names:", class_names) + num_classes = len(class_names) + else: + if len(class_names) == args.num_classes: + print("class_names:", class_names) + else: + print('classfication number of the model mismatch the dataset requirement of:', len(class_names)) + return -1 + + model = get_model(num_classes, edge_size, model_idx, drop_rate, attn_drop_rate, drop_path_rate, + pretrained_backbone, use_cls_token, use_pos_embedding, use_att_module) + + # todo: this model structure is formed under only one condition + if gpu_idx == -1: + if torch.cuda.device_count() > 1: + print("Use", torch.cuda.device_count(), "GPUs!") + # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] 
on 3 GPUs + model = nn.DataParallel(model) + else: + print('we dont have more GPU idx here, try to use gpu_idx=0') + try: + # setting 0 for: only card idx 0 is sighted for this code + os.environ['CUDA_VISIBLE_DEVICES'] = '0' + except: + print("GPU distributing ERRO occur use CPU instead") + + else: + # Decide which device we want to run on + try: + # setting k for: only card idx k is sighted for this code + os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_idx) + except: + print('we dont have that GPU idx here, try to use gpu_idx=0') + try: + # setting 0 for: only card idx 0 is sighted for this code + os.environ['CUDA_VISIBLE_DEVICES'] = '0' + except: + print("GPU distributing ERRO occur use CPU instead") + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # single card for test + + try: + model.load_state_dict(torch.load(save_model_path), False) + except: + print('model loading erro') + else: + print('model loaded') + + model.to(device) + + inferance(model, dataloader, record_dir, class_names=class_names, result_csv_name='inferance.csv', device='cuda') + + +def get_args_parser(): + parser = argparse.ArgumentParser(description='PyTorch ImageNet INF') + + # Model Name or index + parser.add_argument('--model_idx', default='Hybrid2_384_401_testsample', type=str, help='Model Name or index') + + # MIL Stripe + parser.add_argument('--MIL_Stripe', action='store_true', help='MIL_Stripe') + + # drop_rate, attn_drop_rate, drop_path_rate + parser.add_argument('--drop_rate', default=0.0, type=float, help='dropout rate , default 0.0') + parser.add_argument('--attn_drop_rate', default=0.0, type=float, help='dropout rate Aftter Attention, default 0.0') + parser.add_argument('--drop_path_rate', default=0.0, type=float, help='drop path for stochastic depth, default 0.0') + + # Abalation Studies for MSHT + parser.add_argument('--cls_token_off', action='store_true', help='use cls_token in model structure') + parser.add_argument('--pos_embedding_off', 
action='store_true', help='use pos_embedding in model structure') + # 'SimAM', 'CBAM', 'SE' 'None' + parser.add_argument('--att_module', default='SimAM', type=str, help='use which att_module in model structure') + + # Enviroment parameters + parser.add_argument('--gpu_idx', default=0, type=int, + help='use a single GPU with its index, -1 to use multiple GPU') + + # Path parameters + parser.add_argument('--dataroot', default=r'/data/pancreatic-cancer-project/k5_dataset', + help='path to dataset') + parser.add_argument('--model_path', default=r'/home/pancreatic-cancer-project/saved_models', + help='path to save model state-dict') + parser.add_argument('--record_dir', default=r'/home/pancreatic-cancer-project/INF', + help='path to record INF csv') + + # Help tool parameters + parser.add_argument('--paint', action='store_false', help='paint in front desk') # matplotlib.use('Agg') + parser.add_argument('--enable_notify', action='store_true', help='enable notify to send email') + # check tool parameters + parser.add_argument('--enable_tensorboard', action='store_true', help='enable tensorboard to save status') + + parser.add_argument('--enable_attention_check', action='store_true', help='check and save attention map') + parser.add_argument('--enable_visualize_check', action='store_true', help='check and save pics') + + parser.add_argument('--data_augmentation_mode', default=0, type=int, help='data_augmentation_mode') + + # PromptTuning + parser.add_argument('--PromptTuning', default=None, type=str, + help='use Prompt Tuning strategy instead of Finetuning') + # Prompt_Token_num + parser.add_argument('--Prompt_Token_num', default=10, type=int, help='Prompt_Token_num') + + # Dataset based parameters + parser.add_argument('--num_classes', default=0, type=int, help='classification number, default 0 for auto-fit') + parser.add_argument('--edge_size', default=384, type=int, help='edge size of input image') # 224 256 384 1000 + + # Test setting parameters + 
def find_all_files(root, suffix=None):
    """
    Return a list of file paths ended with specific suffix

    The folder tree below ``root`` is walked recursively.

    :param root: folder to search
    :param suffix: None to return every file, a str suffix such as '.jpg',
                   or a tuple/list of str suffixes of which any one may match
    :return: list of file paths; -1 (after printing a message) when the type of
             ``suffix`` is not supported
    """
    if isinstance(suffix, list):
        # str.endswith only accepts a str or a tuple of str
        suffix = tuple(suffix)

    suffix_is_legal = (
        suffix is None
        or isinstance(suffix, str)
        or (isinstance(suffix, tuple) and all(isinstance(s, str) for s in suffix))
    )
    if not suffix_is_legal:
        # Kept as-is for existing callers: report and return the -1 sentinel
        print('type of suffix is not legal :', type(suffix))
        return -1

    res = []
    # `current_dir` instead of re-binding the `root` argument inside the loop
    for current_dir, _, files in os.walk(root):
        for f in files:
            if suffix is None or f.endswith(suffix):
                res.append(os.path.join(current_dir, f))
    return res
ps:PIL(0-1 hwc) + image = np.array(cv2.imread(image_path)) + + # mask_path is replace the last 'data' by 'mask' + mask_path = "data".join(image_path.split("data")[:-1]) + 'mask' + "".join(image_path.split("data")[-1:]) + # mask: 0/255 cv2 hwc + mask = np.array(cv2.imread(mask_path)) + mask_norm = np.where(mask > 50, 0, 1) + # new_image_path is replace the last 'test/data' by 'new_test' + new_image_path = "data".join(image_path.split("test/data")[:-1]) + \ + 'new_test' + "".join(image_path.split("test/data")[-1:]) + new_image = image * mask_norm + + # 把抠掉的部分填充成原图的像素均值 + value_mean_r = int(np.mean(image[:, :, 0])) + value_mean_g = int(np.mean(image[:, :, 1])) + value_mean_b = int(np.mean(image[:, :, 2])) + new_image[:, :, 0][new_image[:, :, 0] == 0] = value_mean_r + new_image[:, :, 1][new_image[:, :, 1] == 0] = value_mean_g + new_image[:, :, 2][new_image[:, :, 2] == 0] = value_mean_b + new_image = new_image.astype(np.uint8) + + # # 显示原图,mask,new_image + # images = np.hstack([image, mask, new_image]) + # cv2.imshow('Before and after mask', images) + # cv2.waitKey(0) + + # 存储新的图片 + cv2.imwrite(new_image_path, new_image) + diff --git a/PuzzleTuning/dataprocessing/resize_and_crop.py b/PuzzleTuning/dataprocessing/resize_and_crop.py new file mode 100644 index 0000000000000000000000000000000000000000..27b5970c6b17b5c30fa82e68ef06cb72db710da4 --- /dev/null +++ b/PuzzleTuning/dataprocessing/resize_and_crop.py @@ -0,0 +1,32 @@ +from PIL import Image +import os + + +def resize_and_crop(source_folder, target_folder, width, height,endswith='.jpg'): + if not os.path.exists(target_folder): + os.makedirs(target_folder) + + for filename in os.listdir(source_folder): + if filename.endswith(endswith): # or filename.endswith(".jpg"): if some images are .jpg + image_path = os.path.join(source_folder, filename) + image = Image.open(image_path) + + # Crop the largest centered square + w, h = image.size + min_dim = min(w, h) + left = (w - min_dim) / 2 + top = (h - min_dim) / 2 + right = (w 
class to_patch:
    """
    Split a image into patches, each patch with the size of patch_size

    Calling the object with a (c, h, w) image returns a tensor of shape
    (num_patches, c, patch_h, patch_w). Patches are ordered row by row: patch
    index ``gy * (w // patch_w) + gx`` holds
    ``x[:, gy*patch_h:(gy+1)*patch_h, gx*patch_w:(gx+1)*patch_w]``.
    """

    def __init__(self, patch_size=(16, 16)):
        """
        :param patch_size: patch edge(s), normalised to a (height, width) pair by to_2tuple
        """
        patch_size = to_2tuple(patch_size)
        self.patch_h = patch_size[0]
        self.patch_w = patch_size[1]

    def __call__(self, x):
        """
        :param x: image of shape (c, h, w); a torch tensor or anything
                  torch.as_tensor accepts (e.g. a numpy array)
        :return: tensor of shape (num_patches, c, patch_h, patch_w)
        """
        # numpy's ndarray.view() takes a dtype, not a shape, so array inputs used to
        # fail; convert once (no copy for tensors, shared memory for numpy arrays).
        x = torch.as_tensor(x)
        c, h, w = x.shape

        assert h % self.patch_h == 0 and w % self.patch_w == 0, \
            'image size must be divisible by the patch size'

        grid_h = h // self.patch_h
        grid_w = w // self.patch_w
        num_patches = grid_h * grid_w

        # patch split
        # (c, h, w)
        # -> (c, grid_h, patch_h, grid_w, patch_w)
        # -> (grid_h, grid_w, c, patch_h, patch_w)
        # -> (num_patches, c, patch_h, patch_w)
        # reshape (not view) so that non-contiguous inputs, e.g. permuted tensors, work too
        patches = x.reshape(c, grid_h, self.patch_h, grid_w, self.patch_w) \
            .permute(1, 3, 0, 2, 4) \
            .reshape(num_patches, c, self.patch_h, self.patch_w)

        # The inverse (patches -> image) is:
        # patches.view(grid_h, grid_w, c, patch_h, patch_w).permute(2, 0, 3, 1, 4).reshape(c, h, w)
        return patches
0000000000000000000000000000000000000000..3d0b6e60f1c9888f3f6fbe2d0faf8e43f115a0da --- /dev/null +++ b/PuzzleTuning/pytorch_grad_cam/README.md @@ -0,0 +1,27 @@ +from https://github.com/jacobgil/pytorch-grad-cam + +# References + +https://arxiv.org/abs/1610.02391 +Grad-CAM: Visual Explanations from Deep Networks via Gradient-based Localization Ramprasaath R. Selvaraju, Michael Cogswell, Abhishek Das, Ramakrishna Vedantam, Devi Parikh, Dhruv Batra + +https://arxiv.org/abs/1710.11063 +Grad-CAM++: Improved Visual Explanations for Deep Convolutional Networks Aditya Chattopadhyay, Anirban Sarkar, Prantik Howlader, Vineeth N Balasubramanian + +https://arxiv.org/abs/1910.01279 +Score-CAM: Score-Weighted Visual Explanations for Convolutional Neural Networks Haofan Wang, Zifan Wang, Mengnan Du, Fan Yang, Zijian Zhang, Sirui Ding, Piotr Mardziel, Xia Hu + +https://ieeexplore.ieee.org/abstract/document/9093360/ +Ablation-cam: Visual explanations for deep convolutional network via gradient-free localization. Saurabh Desai and Harish G Ramaswamy. 
class AblationLayer(torch.nn.Module):
    """
    Wrap a layer and overwrite selected channels of its output.

    Used by AblationCAM: the wrapped layer is swapped into the model, and for
    sample ``i`` of the batch the channel ``indices[i]`` of the layer output is
    ablated.

    :param layer: the original layer whose output is ablated
    :param reshape_transform: when not None, the output is treated as
        channels-last (ViT style) and transposed around the ablation
    :param indices: per-sample channel index to ablate; callers reassign this
        attribute between forward passes
    """

    def __init__(self, layer, reshape_transform, indices):
        super(AblationLayer, self).__init__()

        self.layer = layer
        self.reshape_transform = reshape_transform
        # The channels to zero out:
        self.indices = indices

    def forward(self, x):
        # Must return the ablated output; previously the result was dropped
        # and forward() returned None.
        return self.__call__(x)

    def __call__(self, x):
        # NOTE: overriding __call__ directly means hooks registered on this
        # wrapper module itself are not run.
        output = self.layer(x)

        # Hack to work with ViT,
        # Since the activation channels are last and not first like in CNNs
        # Probably should remove it?
        if self.reshape_transform is not None:
            output = output.transpose(1, 2)

        for i in range(output.size(0)):

            # Commonly the minimum activation will be 0,
            # And then it makes sense to zero it out.
            # However depending on the architecture,
            # If the values can be negative, we use very negative values
            # to perform the ablation, deviating from the paper.
            # The minimum is re-evaluated on every iteration, so it already
            # includes the values written for earlier samples.
            if torch.min(output) == 0:
                output[i, self.indices[i], :] = 0
            else:
                ABLATION_VALUE = 1e5
                output[i, self.indices[i], :] = torch.min(
                    output) - ABLATION_VALUE

        if self.reshape_transform is not None:
            output = output.transpose(2, 1)

        return output
class ActivationsAndGradients:
    """ Record the activations and output gradients of the
    target layers while running a model. """

    def __init__(self, model, target_layers, reshape_transform):
        self.model = model
        self.gradients = []
        self.activations = []
        self.reshape_transform = reshape_transform
        self.handles = []
        for layer in target_layers:
            forward_handle = layer.register_forward_hook(self.save_activation)
            self.handles.append(forward_handle)
            # Older pytorch versions only offer register_backward_hook.
            if hasattr(layer, 'register_full_backward_hook'):
                backward_handle = layer.register_full_backward_hook(
                    self.save_gradient)
            else:
                backward_handle = layer.register_backward_hook(
                    self.save_gradient)
            self.handles.append(backward_handle)

    def save_activation(self, module, input, output):
        """ Forward hook: store the (optionally reshaped) layer output on the CPU. """
        transform = self.reshape_transform
        activation = output if transform is None else transform(output)
        self.activations.append(activation.cpu().detach())

    def save_gradient(self, module, grad_input, grad_output):
        """ Backward hook: store the (optionally reshaped) output gradient on the CPU. """
        transform = self.reshape_transform
        grad = grad_output[0] if transform is None else transform(grad_output[0])
        # Backward visits layers in reverse, so prepend to keep the
        # gradients in the same order as the activations.
        self.gradients = [grad.cpu().detach(), *self.gradients]

    def __call__(self, x):
        """ Clear the previous records and run the model on ``x``. """
        self.gradients, self.activations = [], []
        return self.model(x)

    def release(self):
        """ Remove every hook registered by this object. """
        for hook_handle in self.handles:
            hook_handle.remove()
""" + + def get_cam_weights(self, + input_tensor, + target_layers, + target_category, + activations, + grads): + raise Exception("Not Implemented") + + def get_loss(self, output, target_category): + loss = 0 + for i in range(len(target_category)): + loss = loss + output[i, target_category[i]] + return loss + + def get_cam_image(self, + input_tensor, + target_layer, + target_category, + activations, + grads, + eigen_smooth=False): + weights = self.get_cam_weights(input_tensor, target_layer, + target_category, activations, grads) + weighted_activations = weights[:, :, None, None] * activations + if eigen_smooth: + cam = get_2d_projection(weighted_activations) + else: + cam = weighted_activations.sum(axis=1) + return cam + + def forward(self, input_tensor, target_category=None, eigen_smooth=False): + if self.cuda: + input_tensor = input_tensor.cuda() + + if self.compute_input_gradient: + input_tensor = torch.autograd.Variable(input_tensor, + requires_grad=True) + + output = self.activations_and_grads(input_tensor) + if isinstance(target_category, int): + target_category = [target_category] * input_tensor.size(0) + + if target_category is None: + target_category = np.argmax(output.cpu().data.numpy(), axis=-1) + else: + assert(len(target_category) == input_tensor.size(0)) + + if self.uses_gradients: + self.model.zero_grad() + loss = self.get_loss(output, target_category) + loss.backward(retain_graph=True) + + # In most of the saliency attribution papers, the saliency is + # computed with a single target layer. + # Commonly it is the last convolutional layer. + # Here we support passing a list with multiple target layers. + # It will compute the saliency image for every image, + # and then aggregate them (with a default mean aggregation). + # This gives you more flexibility in case you just want to + # use all conv layers for example, all Batchnorm layers, + # or something else. 
+ cam_per_layer = self.compute_cam_per_layer(input_tensor, + target_category, + eigen_smooth) + return self.aggregate_multi_layers(cam_per_layer) + + def get_target_width_height(self, input_tensor): + width, height = input_tensor.size(-1), input_tensor.size(-2) + return width, height + + def compute_cam_per_layer( + self, + input_tensor, + target_category, + eigen_smooth): + activations_list = [a.cpu().data.numpy() + for a in self.activations_and_grads.activations] + grads_list = [g.cpu().data.numpy() + for g in self.activations_and_grads.gradients] + target_size = self.get_target_width_height(input_tensor) + + cam_per_target_layer = [] + # Loop over the saliency image from every layer + + for target_layer, layer_activations, layer_grads in \ + zip(self.target_layers, activations_list, grads_list): + cam = self.get_cam_image(input_tensor, + target_layer, + target_category, + layer_activations, + layer_grads, + eigen_smooth) + scaled = self.scale_cam_image(cam, target_size) + cam_per_target_layer.append(scaled[:, None, :]) + + return cam_per_target_layer + + def aggregate_multi_layers(self, cam_per_target_layer): + cam_per_target_layer = np.concatenate(cam_per_target_layer, axis=1) + cam_per_target_layer = np.maximum(cam_per_target_layer, 0) + result = np.mean(cam_per_target_layer, axis=1) + return self.scale_cam_image(result) + + def scale_cam_image(self, cam, target_size=None): + result = [] + for img in cam: + img = img - np.min(img) + img = img / (1e-7 + np.max(img)) + if target_size is not None: + img = cv2.resize(img, target_size) + result.append(img) + result = np.float32(result) + + return result + + def forward_augmentation_smoothing(self, + input_tensor, + target_category=None, + eigen_smooth=False): + transforms = tta.Compose( + [ + tta.HorizontalFlip(), + tta.Multiply(factors=[0.9, 1, 1.1]), + ] + ) + cams = [] + for transform in transforms: + augmented_tensor = transform.augment_image(input_tensor) + cam = self.forward(augmented_tensor, + 
target_category, eigen_smooth) + + # The ttach library expects a tensor of size BxCxHxW + cam = cam[:, None, :, :] + cam = torch.from_numpy(cam) + cam = transform.deaugment_mask(cam) + + # Back to numpy float32, HxW + cam = cam.numpy() + cam = cam[:, 0, :, :] + cams.append(cam) + + cam = np.mean(np.float32(cams), axis=0) + return cam + + def __call__(self, + input_tensor, + target_category=None, + aug_smooth=False, + eigen_smooth=False): + + # Smooth the CAM result with test time augmentation + if aug_smooth is True: + return self.forward_augmentation_smoothing( + input_tensor, target_category, eigen_smooth) + + return self.forward(input_tensor, + target_category, eigen_smooth) + + def __del__(self): + self.activations_and_grads.release() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, exc_tb): + self.activations_and_grads.release() + if isinstance(exc_value, IndexError): + # Handle IndexError here... + print( + f"An exception occurred in CAM with block: {exc_type}. 
class EigenCAM(BaseCAM):
    """CAM variant whose saliency map is the 2D projection of the raw
    activations (https://arxiv.org/abs/2008.00299)."""

    def __init__(self, model, target_layers, use_cuda=False,
                 reshape_transform=None):
        super().__init__(model, target_layers, use_cuda, reshape_transform)

    def get_cam_image(self,
                      input_tensor,
                      target_layer,
                      target_category,
                      activations,
                      grads,
                      eigen_smooth):
        # Only the activations are used; gradients, category and the
        # eigen_smooth flag are ignored by this method.
        projection = get_2d_projection(activations)
        return projection
class FullGrad(BaseCAM):
    """
    FullGrad saliency (https://arxiv.org/abs/1905.00780).

    Combines the input-times-gradient map with the bias-times-gradient maps of
    every Conv2d / BatchNorm2d layer that has a bias. The ``target_layers``
    argument is ignored and replaced by those bias layers.
    """

    def __init__(self, model, target_layers, use_cuda=False,
                 reshape_transform=None):
        if len(target_layers) > 0:
            print(
                "Warning: target_layers is ignored in FullGrad. All bias layers will be used instead")

        def layer_with_2D_bias(layer):
            # Exact type match: subclasses of Conv2d / BatchNorm2d are not selected.
            bias_target_layers = [torch.nn.Conv2d, torch.nn.BatchNorm2d]
            if type(layer) in bias_target_layers and layer.bias is not None:
                return True
            return False
        target_layers = find_layer_predicate_recursive(
            model, layer_with_2D_bias)
        # compute_input_gradient=True makes the base class track gradients
        # on the input, which compute_cam_per_layer reads via input_tensor.grad.
        super(
            FullGrad,
            self).__init__(
            model,
            target_layers,
            use_cuda,
            reshape_transform,
            compute_input_gradient=True)
        # One numpy bias vector per selected layer, in the same order as target_layers.
        self.bias_data = [self.get_bias_data(
            layer).cpu().numpy() for layer in target_layers]

    def get_bias_data(self, layer):
        """Return the bias tensor of ``layer``; for BatchNorm2d the running
        statistics are folded in so the result is the layer's effective bias."""
        # Borrowed from official paper impl:
        # https://github.com/idiap/fullgrad-saliency/blob/master/saliency/tensor_extractor.py#L47
        if isinstance(layer, torch.nn.BatchNorm2d):
            bias = - (layer.running_mean * layer.weight
                      / torch.sqrt(layer.running_var + layer.eps)) + layer.bias
            return bias.data
        else:
            return layer.bias.data

    def scale_accross_batch_and_channels(self, tensor, target_size):
        """Normalise and resize every (sample, channel) map of ``tensor``
        independently with ``scale_cam_image``.

        ``target_size`` is (width, height); the result has shape
        (batch, channels, height, width).
        """
        batch_size, channel_size = tensor.shape[:2]
        # Fold batch and channel together so each 2D map is scaled on its own.
        reshaped_tensor = tensor.reshape(
            batch_size * channel_size, *tensor.shape[2:])
        result = self.scale_cam_image(reshaped_tensor, target_size)
        result = result.reshape(
            batch_size,
            channel_size,
            target_size[1],
            target_size[0])
        return result

    def compute_cam_per_layer(
            self,
            input_tensor,
            target_category,
            eigen_smooth):
        """Build the FullGrad maps: |input * input gradient| plus one
        |bias * gradient| map per bias layer, then reduce them to a single
        channel per sample (sum, or 2D projection when ``eigen_smooth``).

        Requires that a backward pass has already populated ``input_tensor.grad``.
        """
        input_grad = input_tensor.grad.data.cpu().numpy()
        grads_list = [g.cpu().data.numpy() for g in
                      self.activations_and_grads.gradients]
        cam_per_target_layer = []
        target_size = self.get_target_width_height(input_tensor)

        # Input-gradient term, scaled per input channel.
        gradient_multiplied_input = input_grad * input_tensor.data.cpu().numpy()
        gradient_multiplied_input = np.abs(gradient_multiplied_input)
        gradient_multiplied_input = self.scale_accross_batch_and_channels(
            gradient_multiplied_input,
            target_size)
        cam_per_target_layer.append(gradient_multiplied_input)

        # Loop over the saliency image from every layer
        assert(len(self.bias_data) == len(grads_list))
        for bias, grads in zip(self.bias_data, grads_list):
            # Broadcast the per-channel bias over batch and spatial dims.
            bias = bias[None, :, None, None]
            # In the paper they take the absolute value,
            # but possibily taking only the positive gradients will work
            # better.
            bias_grad = np.abs(bias * grads)
            result = self.scale_accross_batch_and_channels(
                bias_grad, target_size)
            # Collapse the layer's channels into one map per sample.
            result = np.sum(result, axis=1)
            cam_per_target_layer.append(result[:, None, :])
        cam_per_target_layer = np.concatenate(cam_per_target_layer, axis=1)
        if eigen_smooth:
            # Resize to a smaller image, since this method typically has a very large number of channels,
            # and then consumes a lot of memory
            cam_per_target_layer = self.scale_accross_batch_and_channels(
                cam_per_target_layer, (target_size[0] // 8, target_size[1] // 8))
            cam_per_target_layer = get_2d_projection(cam_per_target_layer)
            cam_per_target_layer = cam_per_target_layer[:, None, :, :]
            cam_per_target_layer = self.scale_accross_batch_and_channels(
                cam_per_target_layer,
                target_size)
        else:
            cam_per_target_layer = np.sum(
                cam_per_target_layer, axis=1)[:, None, :]

        return cam_per_target_layer

    def aggregate_multi_layers(self, cam_per_target_layer):
        """Sum over the layer axis (instead of the base class's mean of
        positive parts) and rescale each map with ``scale_cam_image``."""
        result = np.sum(cam_per_target_layer, axis=1)
        return self.scale_cam_image(result)
class GradCAMPlusPlus(BaseCAM):
    """Grad-CAM++ channel weighting (https://arxiv.org/abs/1710.11063)."""

    def __init__(self, model, target_layers, use_cuda=False,
                 reshape_transform=None):
        super().__init__(model, target_layers, use_cuda, reshape_transform)

    def get_cam_weights(self,
                        input_tensor,
                        target_layers,
                        target_category,
                        activations,
                        grads):
        """Return one weight per (sample, channel): the positive gradients
        weighted by the alpha coefficients and summed over axes 2 and 3."""
        squared = grads**2
        cubed = squared * grads
        # Equation 19 in https://arxiv.org/abs/1710.11063
        activation_totals = np.sum(activations, axis=(2, 3))
        eps = 0.000001
        denominator = (2 * squared +
                       activation_totals[:, :, None, None] * cubed + eps)
        alphas = squared / denominator
        # Now bring back the ReLU from eq.7 in the paper:
        # alphas are kept only where the gradient is non-zero.
        alphas = np.where(grads != 0, alphas, 0)

        positive_grads = np.maximum(grads, 0)
        return np.sum(positive_grads * alphas, axis=(2, 3))
class GuidedBackpropReLUModel:
    """
    Compute guided-backpropagation gradients of a model output w.r.t. its input.

    While a call is running, every ``torch.nn.ReLU`` in the model is replaced by
    ``GuidedBackpropReLUasModule``; the plain ReLUs are put back afterwards.

    :param model: the network to explain; switched to eval mode
    :param use_cuda: move the model (and the inputs) to the GPU
    """

    def __init__(self, model, use_cuda):
        self.model = model
        self.model.eval()
        self.cuda = use_cuda
        if self.cuda:
            self.model = self.model.cuda()

    def forward(self, input_img):
        return self.model(input_img)

    def recursive_replace_relu_with_guidedrelu(self, module_top):
        """Replace every child whose class is named 'ReLU' by the raw
        ``GuidedBackpropReLU.apply`` function (helper, not used by ``__call__``)."""
        for idx, module in module_top._modules.items():
            self.recursive_replace_relu_with_guidedrelu(module)
            if module.__class__.__name__ == 'ReLU':
                module_top._modules[idx] = GuidedBackpropReLU.apply

    def recursive_replace_guidedrelu_with_relu(self, module_top):
        """Undo ``recursive_replace_relu_with_guidedrelu`` (helper, not used by ``__call__``)."""
        try:
            for idx, module in module_top._modules.items():
                self.recursive_replace_guidedrelu_with_relu(module)
                if module == GuidedBackpropReLU.apply:
                    module_top._modules[idx] = torch.nn.ReLU()
        except AttributeError:
            # The recursion reaches entries that are not modules (the plain
            # function installed by the forward replacement) and therefore have
            # no ``_modules``; those are skipped. Anything else propagates.
            pass

    def __call__(self, input_img, target_category=None):
        """
        Return the guided-backprop gradient image for the first sample.

        :param input_img: input batch; only sample 0 is explained
        :param target_category: output index to backpropagate from; defaults to
            the argmax of the (flattened) model output
        :return: numpy array, the input gradient of sample 0 transposed to
            channels-last
        """
        replace_all_layer_type_recursive(self.model,
                                         torch.nn.ReLU,
                                         GuidedBackpropReLUasModule())
        try:
            if self.cuda:
                input_img = input_img.cuda()

            input_img = input_img.requires_grad_(True)

            output = self.forward(input_img)

            if target_category is None:
                target_category = np.argmax(output.cpu().data.numpy())

            loss = output[0, target_category]
            loss.backward(retain_graph=True)

            output = input_img.grad.cpu().data.numpy()
            output = output[0, :, :, :]
            output = output.transpose((1, 2, 0))
        finally:
            # Always restore the plain ReLUs, even if the forward or backward
            # pass raised, so the model is never left in its patched state.
            replace_all_layer_type_recursive(self.model,
                                             GuidedBackpropReLUasModule,
                                             torch.nn.ReLU())

        return output
class ScoreCAM(BaseCAM):
    """
    Score-CAM (https://arxiv.org/abs/1910.01279): weight each activation
    channel by the model score obtained when the input is masked with that
    channel's upsampled, min-max normalised activation map.
    """

    def __init__(
            self,
            model,
            target_layers,
            use_cuda=False,
            reshape_transform=None):
        super(ScoreCAM, self).__init__(model, target_layers, use_cuda,
                                       reshape_transform=reshape_transform)

        if len(target_layers) > 0:
            print("Warning: You are using ScoreCAM with target layers, "
                  "however ScoreCAM will ignore them.")

    def get_cam_weights(self,
                        input_tensor,
                        target_layer,
                        target_category,
                        activations,
                        grads):
        """
        Return softmax-normalised channel weights, shape
        ``(activations.shape[0], activations.shape[1])``.

        ``grads`` is unused. An optional ``self.batch_size`` attribute controls
        how many masked inputs go through the model at once (default 16).
        """
        with torch.no_grad():
            upsample = torch.nn.UpsamplingBilinear2d(
                size=input_tensor.shape[-2:])
            activation_tensor = torch.from_numpy(activations)
            if self.cuda:
                activation_tensor = activation_tensor.cuda()

            upsampled = upsample(activation_tensor)

            flat = upsampled.view(upsampled.size(0),
                                  upsampled.size(1), -1)
            maxs = flat.max(dim=-1)[0][:, :, None, None]
            mins = flat.min(dim=-1)[0][:, :, None, None]
            value_range = maxs - mins
            # A constant channel has max == min; dividing by that zero range
            # gave NaN masks, NaN scores and finally NaN weights for the whole
            # sample. Use 1 there so such a channel becomes an all-zero mask;
            # every other channel is normalised exactly as before.
            value_range = torch.where(value_range > 0,
                                      value_range,
                                      torch.ones_like(value_range))
            upsampled = (upsampled - mins) / value_range

            # One masked copy of each input per activation channel.
            input_tensors = input_tensor[:, None,
                                         :, :] * upsampled[:, :, None, :, :]

            BATCH_SIZE = getattr(self, "batch_size", 16)

            scores = []
            for batch_index, tensor in enumerate(input_tensors):
                category = target_category[batch_index]
                for i in tqdm.tqdm(range(0, tensor.size(0), BATCH_SIZE)):
                    batch = tensor[i: i + BATCH_SIZE, :]
                    outputs = self.model(batch).cpu().numpy()[:, category]
                    scores.extend(outputs)
            scores = torch.Tensor(scores)
            scores = scores.view(activations.shape[0], activations.shape[1])

            weights = torch.nn.Softmax(dim=-1)(scores).numpy()
            return weights
def replace_layer_recursive(model, old_layer, new_layer):
    """Replace the first occurrence of ``old_layer`` (matched by identity)
    anywhere below ``model`` with ``new_layer``.

    :return: True if a replacement was made, False otherwise
    """
    for name, layer in model._modules.items():
        # ``_modules`` may hold None for an unset child slot; nothing to
        # match or descend into there.
        if layer is None:
            continue
        if layer is old_layer:
            model._modules[name] = new_layer
            return True
        if replace_layer_recursive(layer, old_layer, new_layer):
            return True
    return False


def replace_all_layer_type_recursive(model, old_layer_type, new_layer):
    """Replace every descendant of ``model`` that is an instance of
    ``old_layer_type`` with ``new_layer``.

    The same ``new_layer`` object is installed at every position. The
    children of a replaced layer are still visited.
    """
    for name, layer in model._modules.items():
        if layer is None:
            continue
        if isinstance(layer, old_layer_type):
            model._modules[name] = new_layer
        replace_all_layer_type_recursive(layer, old_layer_type, new_layer)


def find_layer_types_recursive(model, layer_types):
    """Return all descendants of ``model`` whose exact type is in ``layer_types``."""
    def predicate(layer):
        return type(layer) in layer_types
    return find_layer_predicate_recursive(model, predicate)


def find_layer_predicate_recursive(model, predicate):
    """Return all descendants of ``model`` for which ``predicate(layer)`` is
    true, in depth-first order (a layer before its own children)."""
    result = []
    for name, layer in model._modules.items():
        if layer is None:
            continue
        if predicate(layer):
            result.append(layer)
        result.extend(find_layer_predicate_recursive(layer, predicate))
    return result
def deprocess_image(img):
    """ Standardise an array and map it to a displayable uint8 image.

    The values are centred, divided by their standard deviation, squeezed
    around 0.5 with a spread of 0.1, clipped to [0, 1] and scaled to 0..255.
    see https://github.com/jacobgil/keras-grad-cam/blob/master/grad-cam.py#L65 """
    centered = img - np.mean(img)
    # The small constant keeps a constant image from dividing by zero.
    standardized = centered / (np.std(centered) + 1e-5)
    unit_range = np.clip(standardized * 0.1 + 0.5, 0, 1)
    return np.uint8(unit_range * 255)
def get_2d_projection(activation_batch):
    """Project every item of a batch onto its first principal direction.

    For each item the first axis is kept as the feature axis and all
    remaining axes are flattened into samples. The samples are centred per
    feature, decomposed with an SVD, and projected onto the first right
    singular vector. The result is reshaped back to the item's trailing
    axes.

    :param activation_batch: numpy array; iterated along its first axis.
        NaN entries are treated as 0. The caller's array is left untouched.
    :returns: ``float32`` array with one projection per batch item.
    """
    # TBD: use pytorch batch svd implementation
    # Build a NaN-free copy instead of overwriting the caller's buffer.
    cleaned = np.where(np.isnan(activation_batch), 0, activation_batch)
    projections = []
    for activations in cleaned:
        samples = activations.reshape(activations.shape[0], -1).transpose()
        # Centering before the SVD seems to be important here,
        # otherwise the image returned is negative.
        samples = samples - samples.mean(axis=0)
        # Only the first right singular vector is used, so the reduced
        # decomposition is enough and avoids building the full square factors.
        _, _, vt = np.linalg.svd(samples, full_matrices=False)
        projection = samples @ vt[0, :]
        projections.append(projection.reshape(activations.shape[1:]))
    return np.float32(projections)
def zero_trans_mystrlr_to_float(in_str):
    """Decode a compact learning-rate tag into a float, e.g. ``'305'`` -> 0.0005.

    Tag layout: the first character is the number of zeros directly after
    the decimal point, the second character is a ``'0'`` separator, and the
    remaining characters are the significant digits (``'3025'`` -> 0.00025).
    Tags without the separator fall back to using only their last character
    as the single significant digit.

    :param in_str: tag string; only a single leading digit is read as the
        zero count.
    :returns: the decoded float.
    """
    num_of_zero = int(in_str[0])
    if len(in_str) > 2 and in_str[1] == '0':
        digits = in_str[2:]  # keep every significant digit, not just the last one
    else:
        digits = in_str[-1]
    return float('0.' + '0' * num_of_zero + digits)


def zero_trans_floatlr_to_mystrlr(in_float):
    """Encode a learning rate in (0, 1) as a compact tag, e.g. 0.0005 -> ``'305'``.

    The tag is ``<zeros after the decimal point>`` + ``'0'`` + ``<significant
    digits>``. The digits come from the shortest decimal representation of
    the float (``repr``), so binary rounding noise such as
    0.00050000000000000001 does not leak into the tag.

    :param in_float: learning rate, strictly between 0 and 1.
    :returns: the tag string.
    :raises ValueError: if the value is not strictly between 0 and 1.
    """
    from decimal import Decimal  # local import: the module import block stays as it is

    value = float(in_float)
    if not 0.0 < value < 1.0:
        raise ValueError('learning rate must be strictly between 0 and 1, got %r' % (in_float,))

    # Decimal(repr(x)) expands forms like '1e-05' into plain positional digits.
    fraction = format(Decimal(repr(value)), 'f').partition('.')[2]
    digits = fraction.lstrip('0')
    zero_counts = len(fraction) - len(digits)
    return str(zero_counts) + '0' + digits.rstrip('0')
def concatenate_the_lines_from_several_files(directory='./', cat_file='0.sh'):
    """Merge every ``.sh`` file found under ``directory`` into one script.

    The merged script starts with a single ``#!/bin/sh`` line; shebang lines
    of the individual files are dropped. Files are visited in sorted order
    so the result is reproducible, and the output file itself is never used
    as an input, so running the function again does not duplicate content.

    :param directory: root folder that is walked recursively.
    :param cat_file: name of the merged script, written inside ``directory``.
    """
    cat_file_path = os.path.join(directory, cat_file)
    cat_file_abs = os.path.abspath(cat_file_path)
    all_lines = ["#!/bin/sh\n"]

    for root, dirs, files in os.walk(directory):
        dirs.sort()  # os.walk honours in-place edits: fixes the traversal order
        for file_name in sorted(files):
            if not file_name.endswith(".sh"):
                continue
            file_path = os.path.join(root, file_name)
            if os.path.abspath(file_path) == cat_file_abs:
                # A previous run's output must not be fed back into itself.
                continue

            with open(file_path, 'r') as file:
                lines = file.readlines()

            all_lines.extend(line for line in lines if line != "#!/bin/sh\n")
            print('file_path:', file_path, 'has taken')

    with open(cat_file_path, 'w') as file:
        file.writelines(all_lines)
--data_augmentation_mode ' + data_augmentation_mode + + ' --model_idx ViT_base_' + model_weight_idx + '_' + lr_mystr + '_lf' + lrf_mystr + '_finetuning_' + + dataset_name + '_CLS --dataroot ' + str(dataroot) + ' --draw_root ' + draw_root + ' --model_path ' + + save_model_PATH) + # VPT + prompting + print( + 'python Train.py --gpu_idx ' + GPU_idx + ' --edge_size 224 --data_augmentation_mode ' + data_augmentation_mode + + ' --lr ' + lr + ' --lrf ' + lrf + ' --enable_tensorboard --model_idx ViT_base_' + model_weight_idx + + '_PromptDeep_' + str(Prompt_Token_num) + '_' + lr_mystr + '_lf' + lrf_mystr + '_prompting_' + dataset_name + + '_CLS --PromptTuning Deep --Prompt_Token_num ' + str(Prompt_Token_num) + ' --dataroot ' + str( + dataroot) + ' --draw_root ' + draw_root + + ' --Pre_Trained_model_path ' + Pre_Trained_model_path + ' --model_path ' + save_model_PATH) + print( + 'python Test.py --gpu_idx ' + GPU_idx + ' --edge_size 224 --data_augmentation_mode ' + data_augmentation_mode + + ' --model_idx ViT_base_' + model_weight_idx + '_PromptDeep_' + str( + Prompt_Token_num) + '_' + lr_mystr + '_lf' + lrf_mystr + + '_prompting_' + dataset_name + '_CLS --PromptTuning Deep --Prompt_Token_num ' + str( + Prompt_Token_num) + ' --dataroot ' + str(dataroot) + ' --draw_root ' + + draw_root + ' --Pre_Trained_model_path ' + Pre_Trained_model_path + ' --model_path ' + save_model_PATH) + # VPT + finetuning + print( + 'python Train.py --gpu_idx ' + GPU_idx + ' --edge_size 224 --data_augmentation_mode ' + data_augmentation_mode + + ' --lr ' + lr + ' --lrf ' + lrf + ' --enable_tensorboard --model_idx ViT_base_' + model_weight_idx + + '_PromptDeep_' + str( + Prompt_Token_num) + '_' + lr_mystr + '_lf' + lrf_mystr + '_finetuning_' + dataset_name + + '_CLS --PromptTuning Deep --Prompt_Token_num ' + str( + Prompt_Token_num) + ' --PromptUnFreeze --dataroot ' + str(dataroot) + ' --draw_root ' + draw_root + + ' --Pre_Trained_model_path ' + Pre_Trained_model_path + ' --model_path ' + 
save_model_PATH) + print( + 'python Test.py --gpu_idx ' + GPU_idx + ' --edge_size 224 --data_augmentation_mode ' + data_augmentation_mode + + ' --model_idx ViT_base_' + model_weight_idx + '_PromptDeep_' + str( + Prompt_Token_num) + '_' + lr_mystr + '_lf' + lrf_mystr + + '_finetuning_' + dataset_name + '_CLS --PromptTuning Deep --Prompt_Token_num ' + str( + Prompt_Token_num) + ' --PromptUnFreeze --dataroot ' + str(dataroot) + + ' --draw_root ' + draw_root + ' --model_path ' + save_model_PATH) + else: + # send a VPT prompt state inside to build the prompt tokens + # we build VPT backbone with the ViT-timm weight, then do finetuning and prompting + # fixme notice here Pre_Trained_model_path is actually the trained prompt state path + # VPT + prompting + print( + 'python Train.py --gpu_idx ' + GPU_idx + ' --edge_size 224 --data_augmentation_mode ' + data_augmentation_mode + + ' --lr ' + lr + ' --lrf ' + lrf + ' --enable_tensorboard --model_idx ViT_base_' + model_weight_idx + + '_PromptDeep_' + str(Prompt_Token_num) + '_' + lr_mystr + '_lf' + lrf_mystr + '_prompting_' + dataset_name + + '_CLS --PromptTuning Deep --Prompt_Token_num ' + str(Prompt_Token_num) + ' --dataroot ' + str( + dataroot) + ' --draw_root ' + draw_root + + ' --Pre_Trained_model_path ' + VPT_backbone_model_path + ' --Prompt_state_path ' + Pre_Trained_model_path + ' --model_path ' + save_model_PATH) + print( + 'python Test.py --gpu_idx ' + GPU_idx + ' --edge_size 224 --data_augmentation_mode ' + data_augmentation_mode + + ' --model_idx ViT_base_' + model_weight_idx + '_PromptDeep_' + str( + Prompt_Token_num) + '_' + lr_mystr + '_lf' + lrf_mystr + + '_prompting_' + dataset_name + '_CLS --PromptTuning Deep --Prompt_Token_num ' + str( + Prompt_Token_num) + ' --dataroot ' + str(dataroot) + ' --draw_root ' + + draw_root + ' --Pre_Trained_model_path ' + VPT_backbone_model_path + ' --model_path ' + save_model_PATH) + # VPT + finetuning + print( + 'python Train.py --gpu_idx ' + GPU_idx + ' --edge_size 224 
def write_PuzzleTuning_comparison_script(lr_mystr, lrf_mystr, data_augmentation_mode, dataset_name, GPU_idx='0'):
    """Print the full shell script of the PuzzleTuning comparison experiments.

    Fixed server layout used by the generated commands:
        datasets:            /root/autodl-tmp/datasets
        pre-trained weights: /root/autodl-tmp/pre_trained_models
        saved models:        /root/autodl-tmp/saved_models
        draw_root:           /root/autodl-tmp/PuzzleTuning_Comparison/[lr]_lf[lrf]_[dataset_name]

    For every weight listed below a header comment line is printed, followed
    by the Train/Test commands from print_a_PuzzleTuning_comparison_script.
    The script ends with the check_log_json.py call.

    :param lr_mystr: learning-rate tag (decoded with zero_trans_mystrlr_to_float).
    :param lrf_mystr: digits after '0.' of the lrf value, as a string.
    :param data_augmentation_mode: passed through to the commands as text.
    :param dataset_name: dataset name; '_CLS' is appended for the data folder.
    :param GPU_idx: GPU index, passed through to the commands as text.
    """
    dataroot_PATH = '/root/autodl-tmp/datasets'
    Pre_Trained_model_path_PATH = '/root/autodl-tmp/pre_trained_models'
    save_model_PATH = '/root/autodl-tmp/saved_models'
    draw_root_PATH = '/root/autodl-tmp/PuzzleTuning_Comparison'

    data_augmentation_mode = str(data_augmentation_mode)
    GPU_idx = str(GPU_idx)

    lr = str(zero_trans_mystrlr_to_float(lr_mystr))
    lrf = '0.' + str(lrf_mystr)

    experiment_idx = lr_mystr + '_lf' + lrf_mystr + '_' + dataset_name

    dataroot = os.path.join(dataroot_PATH, dataset_name + '_CLS')
    draw_root = os.path.join(draw_root_PATH, experiment_idx)

    # (header line, model_weight_idx, model_weight_name, Prompt_input)
    # Prompt_input=True: the weight file is a pre-trained VPT prompt state used on top of
    # the timm ViT backbone. Prompt_input=False: the weight file is a ViT backbone.
    experiments = (
        # PuzzleTuning official version: SAE + VPT, starting from timm
        ('#SAE-timm-start_promptstate',
         'ViT_base_timm_PuzzleTuning_SAE_E_199_promptstate',
         'ViT_b16_224_timm_PuzzleTuning_SAE_CPIAm_Prompt_Deep_tokennum_20_E_199_promptstate.pth', True),

        # Comparison methods (ViT weights): headers read "<method> comparison"
        ('#空白对比', 'random', 'ViT_b16_224_Random_Init.pth', False),  # blank / random init
        ('#timm对比', 'timm', 'ViT_b16_224_Imagenet.pth', False),
        ('#MAEImageNet对比', 'MAEImageNet', 'ViT_b16_224_MAEImageNet_Init.pth', False),
        ('#mae对比', 'timm_mae_CPIAm_E100', 'ViT_b16_224_timm_mae_ALL_100.pth', False),
        ('#moco对比', 'timm_moco_CPIAm_E100', 'ViT_b16_224_timm_moco_ALL_100.pth', False),
        ('#dino对比', 'timm_dino_CPIAm_E100', 'ViT_b16_224_timm_dino_ALL_100.pth', False),
        ('#BYOL对比', 'timm_BYOL_CPIAm_E50', 'ViT_b16_224_timm_BYOL_ALL_50.pth', False),
        ('#GCMAE对比', 'timm_GCMAE_CPIAm_E80', 'ViT_b16_224_timm_GCMAE_ALL_80.pth', False),
        ('#SDMAE对比', 'timm_SDMAE_CPIAm_E80', 'ViT_b16_224_timm_SDMAE_ALL_80.pth', False),
        ('#SIMMIM对比', 'timm_SIMMIM_CPIAm_E200', 'ViT_b16_224_timm_SIMMIM_ALL_200.pth', False),
        ('#SIMCLR对比', 'timm_SIMCLR_CPIAm_E100', 'ViT_b16_224_timm_SIMCLR_ALL_100.pth', False),

        # Ablations with pre-trained ViT weights
        ('#PuzzleTuning_SAE_ViT-CPIA对比', 'timm_PuzzleTuning_SAE_E_199',
         'ViT_b16_224_timm_PuzzleTuning_SAE_CPIAm_E_199.pth', False),
        ('#SAE_fixp16fixr25-timm-start', 'ViT_base_timm_PuzzleTuning_SAE_fixp16fixr25_E_199',
         'ViT_b16_224_timm_PuzzleTuning_SAE_fixp16fixr25_CPIAm_E_199.pth', False),
        ('#SAE_fixp16ratiodecay-timm-start', 'ViT_base_timm_PuzzleTuning_SAE_fixp16ratiodecay_E_199',
         'ViT_b16_224_timm_PuzzleTuning_SAE_fixp16ratiodecay_CPIAm_E_199.pth', False),

        # Ablations with pre-trained VPT prompt tokens
        ('#MAE-VPT_promptstate', 'timm_mae_Prompt_CPIAm_E199_promptstate',
         'ViT_b16_224_timm_PuzzleTuning_MAE_CPIAm_Prompt_Deep_tokennum_20_E_199_promptstate.pth', True),
        ('#SAE-MAE-start_promptstate', 'ViT_base_MAEImageNet_PuzzleTuning_SAE_E_199_promptstate',
         'ViT_b16_224_MAEImageNet_PuzzleTuning_SAE_CPIAm_Prompt_Deep_tokennum_20_E_199_promptstate.pth', True),
        ('#SAE-Random-start_promptstate', 'ViT_base_Random_PuzzleTuning_SAE_E_199_promptstate',
         'ViT_b16_224_Random_PuzzleTuning_SAE_CPIAm_Prompt_Deep_tokennum_20_E_199_promptstate.pth', True),
        ('#SAE_fixp16fixr25-timm-start_promptstate',
         'ViT_base_timm_PuzzleTuning_SAE_fixp16fixr25_E_199_promptstate',
         'ViT_b16_224_timm_PuzzleTuning_SAE_fixp16fixr25_CPIAm_Prompt_Deep_tokennum_20_E_199_promptstate.pth',
         True),
        ('#SAE_fixp16ratiodecay-timm-start_promptstate',
         'ViT_base_timm_PuzzleTuning_SAE_fixp16ratiodecay_E_199_promptstate',
         'ViT_b16_224_timm_PuzzleTuning_SAE_fixp16ratiodecay_CPIAm_Prompt_Deep_tokennum_20_E_199_promptstate.pth',
         True),
    )

    for header, model_weight_idx, model_weight_name, prompt_input in experiments:
        print(header)
        print_a_PuzzleTuning_comparison_script(model_weight_idx, model_weight_name, lr, lrf, lr_mystr, lrf_mystr,
                                               dataset_name, dataroot, draw_root, Pre_Trained_model_path_PATH,
                                               save_model_PATH, data_augmentation_mode, GPU_idx,
                                               Prompt_input=prompt_input)

    # Closing section: collect the logs of this experiment group.
    print('')
    print('cd /home/pancreatic-cancer-diagnosis-tansformer/code/utils')
    record_dir = os.path.join(draw_root, 'CSV_logs')
    print('python check_log_json.py --enable_notify --draw_root ' + draw_root + ' --record_dir ' + record_dir)
    print('cd /home/pancreatic-cancer-diagnosis-tansformer/code')
+ in the additional experiments, we save the runs to + draw_root /root/autodl-tmp/runs/[*lr*_*lrf*_*dataset_name*] + and then copy a duplicates to /root/autodl-tmp/PuzzleTuning_Comparison/[*lr*_*lrf*_*dataset_name*] + + """ + dataroot_PATH = '/root/autodl-tmp/datasets' + Pre_Trained_model_path_PATH = '/root/autodl-tmp/pre_trained_models' + save_model_PATH = '/root/autodl-tmp/saved_models' + draw_root_PATH = '/root/autodl-tmp/runs' + copy_to_draw_root_PATH = '/root/autodl-tmp/PuzzleTuning_Comparison' + + data_augmentation_mode = str(data_augmentation_mode) + GPU_idx = str(GPU_idx) + + lr = str(zero_trans_mystrlr_to_float(lr_mystr)) + lrf = '0.' + str(lrf_mystr) + + experiment_idx = lr_mystr + '_lf' + lrf_mystr + '_' + dataset_name + add_experiment_idx = add_idx + '_' + lr_mystr + '_lf' + lrf_mystr + '_' + dataset_name + + dataroot = os.path.join(dataroot_PATH, dataset_name + '_CLS') + # additional exp runs path + draw_root = os.path.join(draw_root_PATH, add_experiment_idx) + # basic all exp runs path + copy_draw_root = os.path.join(copy_to_draw_root_PATH, experiment_idx) + + print('# Additional ' + add_idx) + print_a_PuzzleTuning_comparison_script(model_weight_idx, model_weight_name, lr, lrf, lr_mystr, lrf_mystr, + dataset_name, dataroot, draw_root, Pre_Trained_model_path_PATH, + save_model_PATH, data_augmentation_mode, GPU_idx, Prompt_input=Prompt_input) + print('') + print('cd /home/pancreatic-cancer-diagnosis-tansformer/code/utils') + # update the total record + print('') + print('cp -r ' + draw_root + '/*' + ' ' + copy_draw_root) + record_dir = os.path.join(copy_draw_root, 'CSV_logs') + print('python check_log_json.py --draw_root ' + copy_draw_root + ' --record_dir ' + record_dir) + + # update the additional runs and send to notify + record_dir = os.path.join(draw_root, add_experiment_idx) + print('python check_log_json.py --enable_notify --draw_root ' + draw_root + ' --record_dir ' + record_dir) + + print('cd /home/pancreatic-cancer-diagnosis-tansformer/code') 
def write_CLS_script(model_idxs, data_augmentation_mode, edge_size, batch_size, lr, lrf, enable_tensorboard,
                     test_enable_attention_check, dataset_name, dataroot, model_path, draw_root):
    """Print Train.py commands for every model, then Test.py commands for every model.

    Each command is followed by an empty line. The run name has the form
    ``<model>_<edge>_<lr tag>_PT_lf<lrf percent>_b<batch>_<dataset>_CLS``.

    :param model_idxs: iterable of model names.
    :param data_augmentation_mode: passed through as text.
    :param edge_size: input edge size; forced to 224 for cross_former, convit,
        visformer and ViT_h.
    :param batch_size: passed through as text.
    :param lr: learning rate (float); also encoded into the run name.
    :param lrf: final learning-rate factor (float); its percentage goes into the run name.
    :param enable_tensorboard: only the exact value True adds --enable_tensorboard.
    :param test_enable_attention_check: only the exact value True adds --enable_attention_check.
    :param dataset_name: dataset name; ``dataroot + dataset_name + '_CLS'`` is the data folder.
    :param dataroot: prefix of the data folder (string).
    :param model_path: value for --model_path (string).
    :param draw_root: value for --draw_root (string).
    """
    data_augmentation_mode = str(data_augmentation_mode)
    default_edge_size = str(edge_size)
    batch_size = str(batch_size)
    lr_name = zero_trans_floatlr_to_mystrlr(lr)
    lr = str(lr)
    # Round away float noise before truncating: 100 * 0.29 is 28.999999999999996.
    lf_name = str(int(round(100 * lrf, 6)))
    lrf = str(lrf)
    dataroot = dataroot + dataset_name + '_CLS'

    fixed_224_models = ('cross_former', 'convit', 'visformer', 'ViT_h')

    def describe(model_idx):
        # Edge size and run name shared by the Train and Test command of one model.
        edge = '224' if model_idx in fixed_224_models else default_edge_size
        run_name = (model_idx + '_' + edge + '_' + lr_name + '_PT_lf' + lf_name
                    + '_b' + batch_size + '_' + dataset_name + '_CLS')
        return edge, run_name

    for model_idx in model_idxs:
        edge, run_name = describe(model_idx)
        parts = ['python Train.py', '--model_idx', run_name, '--edge_size', edge,
                 '--data_augmentation_mode', data_augmentation_mode, '--batch_size', batch_size,
                 '--lr', lr, '--lrf', lrf]
        if enable_tensorboard is True:
            parts.append('--enable_tensorboard')
        parts += ['--dataroot', dataroot, '--model_path', model_path, '--draw_root', draw_root]
        print(' '.join(parts))
        print('')

    for model_idx in model_idxs:
        edge, run_name = describe(model_idx)
        parts = ['python Test.py', '--model_idx', run_name, '--edge_size', edge,
                 '--data_augmentation_mode', data_augmentation_mode]
        if test_enable_attention_check is True:
            parts.append('--enable_attention_check')
        parts += ['--dataroot', dataroot, '--model_path', model_path, '--draw_root', draw_root]
        print(' '.join(parts))
        print('')
def write_MIL_script(model_idxs, data_augmentation_mode, edge_size, batch_size, patch_size, lr, lrf, enable_tensorboard,
                     test_enable_attention_check, dataset_name, dataroot, model_path, draw_root, imaging_root=None):
    """Print MIL_train.py commands for every model, then the test commands for every model.

    Each command is followed by an empty line. The run name has the form
    ``<model>_<edge>_<lr tag>_PT_lf<lrf percent>_b<batch>_p<patch>_<dataset>_MIL``.
    Per model the test section prints one MIL_test.py run (batch size 1) and
    then either the attention-check runs (written to ``imaging_root``) or a
    plain Test.py run on the CLS data.

    NOTE(review): the source indentation is lost in this view; the
    attention-check section is placed inside the per-model test loop because
    it uses ``model_idx`` - confirm against the original file.

    :param model_idxs: iterable of model names.
    :param enable_tensorboard: only the exact value True adds --enable_tensorboard.
    :param test_enable_attention_check: only the exact value True selects the attention-check runs.
    :param dataset_name: dataset name; '_MIL' / '_CLS' are appended for the two data folders.
    :param dataroot: common prefix of both data folders (string).
    :param imaging_root: output folder for the attention-check images; defaults to ``draw_root``.
    """
    # imaging_root is where the plotted checks are stored; it may equal draw_root.
    if imaging_root is None:
        imaging_root = draw_root

    data_augmentation_mode = str(data_augmentation_mode)
    edge_size = str(edge_size)
    batch_size = str(batch_size)
    patch_size = str(patch_size)
    lr_name = zero_trans_floatlr_to_mystrlr(lr)
    lr = str(lr)
    # Round away float noise before truncating: 100 * 0.29 is 28.999999999999996.
    lf_name = str(int(round(100 * lrf, 6)))
    lrf = str(lrf)

    # Derive both folders from the untouched prefix. Building the CLS path from the
    # already '_MIL'-suffixed path would give '<root><name>_MIL<name>_CLS'.
    root_prefix = dataroot
    dataroot = root_prefix + dataset_name + '_MIL'
    CLS_dataroot = root_prefix + dataset_name + '_CLS'

    def run_name_of(model_idx):
        return (model_idx + '_' + edge_size + '_' + lr_name + '_PT_lf' + lf_name + '_b' + batch_size
                + '_p' + patch_size + '_' + dataset_name + '_MIL')

    for model_idx in model_idxs:
        parts = ['python MIL_train.py', '--model_idx', run_name_of(model_idx), '--edge_size', edge_size,
                 '--data_augmentation_mode', data_augmentation_mode, '--batch_size', batch_size,
                 '--patch_size', patch_size, '--lr', lr, '--lrf', lrf]
        if enable_tensorboard is True:
            parts.append('--enable_tensorboard')
        parts += ['--dataroot', dataroot, '--model_path', model_path, '--draw_root', draw_root]
        print(' '.join(parts))
        print('')

    for model_idx in model_idxs:
        run_name = run_name_of(model_idx)

        print(' '.join(['python MIL_test.py', '--model_idx', run_name, '--edge_size', edge_size,
                        '--patch_size', patch_size, '--batch_size 1',
                        '--data_augmentation_mode', data_augmentation_mode,
                        '--dataroot', dataroot, '--model_path', model_path, '--draw_root', draw_root]))
        print('')

        if test_enable_attention_check is True:  # experiments with several batch settings
            print(' '.join(['python Test.py', '--model_idx', run_name, '--edge_size', edge_size,
                            '--data_augmentation_mode', data_augmentation_mode,
                            '--MIL_Stripe --enable_attention_check --check_minibatch 10',
                            '--dataroot', CLS_dataroot, '--model_path', model_path,
                            '--draw_root', imaging_root]))
            print('')

            # Three shuffled-attention checks that differ only in the loader settings.
            for loader_flags in ('--shuffle_dataloader --batch_size 4 --check_minibatch 10',
                                 '--batch_size 4 --check_minibatch 10',
                                 '--batch_size 1 --check_minibatch 10'):
                print(' '.join(['python MIL_test.py', '--model_idx', run_name,
                                '--shuffle_attention_check --MIL_Stripe', '--edge_size', edge_size,
                                '--data_augmentation_mode', data_augmentation_mode, loader_flags,
                                '--patch_size', patch_size, '--dataroot', dataroot,
                                '--model_path', model_path, '--draw_root', imaging_root]))
                print('')
        else:
            print(' '.join(['python Test.py', '--model_idx', run_name, '--edge_size', edge_size,
                            '--data_augmentation_mode', data_augmentation_mode, '--MIL_Stripe',
                            '--dataroot', CLS_dataroot, '--model_path', model_path,
                            '--draw_root', draw_root]))
            print('')
+ + write_CLS_script(model_idxs=model_idxs, + data_augmentation_mode=3, + edge_size=384, + batch_size=batch_size, + lr=0.000007, + lrf=0.35, + enable_tensorboard=True, + test_enable_attention_check=True, + dataset_name=dataset_name, + dataroot='/root/autodl-tmp/datasets/', + model_path='/root/autodl-tmp/saved_models', + draw_root='/root/autodl-tmp/runs') + + # 正式实验的时候,后面还需要做各种MIL的消融实验 + # TODO 更多write_MIL_script + # 其次摸索CLS+特定模型vit+不同数据增强 对比实验结果 + augmentation_names = ['Cutout', 'Mixup', 'CutMix'] + write_CLS_AUG_script(model_idx='ViT', + augmentation_names=augmentation_names, + data_augmentation_mode=3, + edge_size=384, + batch_size=batch_size, + lr=0.000007, + lrf=0.35, + enable_tensorboard=True, + test_enable_attention_check=True, + dataset_name=dataset_name, + dataroot='/root/autodl-tmp/datasets/', + model_path='/root/autodl-tmp/saved_models', + draw_root='/root/autodl-tmp/runs') + + # 最后摸索MIL+ViT的实验结果 + MIL_model_idxs = ['ViT', ] + # MIL ablations + write_MIL_script(model_idxs=MIL_model_idxs, + data_augmentation_mode=3, + edge_size=384, + batch_size=batch_size, + patch_size=16, + lr=0.000007, + lrf=0.35, + enable_tensorboard=True, + test_enable_attention_check=False, + dataset_name=dataset_name, + dataroot='/root/autodl-tmp/datasets/', + model_path='/root/autodl-tmp/saved_models', + draw_root='/root/autodl-tmp/runs', + imaging_root='/root/autodl-tmp/imaging_results') + write_MIL_script(model_idxs=MIL_model_idxs, + data_augmentation_mode=3, + edge_size=384, + batch_size=batch_size, + patch_size=64, + lr=0.000007, + lrf=0.35, + enable_tensorboard=True, + test_enable_attention_check=False, + dataset_name=dataset_name, + dataroot='/root/autodl-tmp/datasets/', + model_path='/root/autodl-tmp/saved_models', + draw_root='/root/autodl-tmp/runs', + imaging_root='/root/autodl-tmp/imaging_results') + write_MIL_script(model_idxs=MIL_model_idxs, + data_augmentation_mode=3, + edge_size=384, + batch_size=batch_size, + patch_size=48, + lr=0.000007, + lrf=0.35, + 
def get_args_parser():
    """
    Build the command-line parser of this script-writing helper.

    Every option is parsed as a string (type=str). All options default to None except --GPU_idx ('0').

    :return: argparse.ArgumentParser
    """
    parser = argparse.ArgumentParser(description='Automatically write shell script for training')

    # Experiment settings that are written into the generated command lines
    parser.add_argument('--lr_mystr', default=None, type=str, help='Model lr EG: 506 -> 0.000006')
    # argparse applies %-formatting to help strings, so a literal '%' has to be written as '%%';
    # a bare trailing '%' makes `--help` fail with ValueError
    parser.add_argument('--lrf_mystr', default=None, type=str, help='Model lrf EG: 50 -> cosine decay to 50%%')
    parser.add_argument('--data_augmentation_mode', default=None, type=str, help='ROSE,pRCC:0; CAM16,WBC:3')
    parser.add_argument('--dataset_name', default=None, type=str, help='ROSE,pRCC,CAM16,WBC ?')
    parser.add_argument('--GPU_idx', default='0', type=str, help='Experiment GPU_idx EG: 0')

    return parser


if __name__ == '__main__':
    parser = get_args_parser()
    args = parser.parse_args()

    # the generated script goes to stdout, starting with its shebang line
    print('#!/bin/sh')
    print('')
    # add DropPos-CPIA
    write_additional_PuzzleTuning_comparison_script(add_idx='DropPos-CPIA', lr_mystr=args.lr_mystr,
                                                    lrf_mystr=args.lrf_mystr,
                                                    data_augmentation_mode=args.data_augmentation_mode,
                                                    dataset_name=args.dataset_name,
                                                    model_weight_idx='timm_DropPos_CPIAm_E200',
                                                    model_weight_name='ViT_b16_224_timm_DropPos_ALL_200.pth',
                                                    GPU_idx=args.GPU_idx, Prompt_input=False)

    # The string blocks below are disabled recipes (they are never executed), kept for reference.
    '''
    # add MAE-CPIA
    write_additional_PuzzleTuning_comparison_script(add_idx='MAE-CPIA', lr_mystr=args.lr_mystr,
                                                    lrf_mystr=args.lrf_mystr,
                                                    data_augmentation_mode=args.data_augmentation_mode,
                                                    dataset_name=args.dataset_name,
                                                    model_weight_idx='timm_mae_CPIAm_E100',
                                                    model_weight_name='ViT_b16_224_timm_mae_ALL_100.pth',
                                                    GPU_idx=args.GPU_idx, Prompt_input=False)
    # add SDMAE-CPIA
    write_additional_PuzzleTuning_comparison_script(add_idx='SDMAE-CPIA', lr_mystr=args.lr_mystr,
                                                    lrf_mystr=args.lrf_mystr,
                                                    data_augmentation_mode=args.data_augmentation_mode,
                                                    dataset_name=args.dataset_name,
                                                    model_weight_idx='timm_SDMAE_CPIAm_E80',
                                                    model_weight_name='ViT_b16_224_timm_SDMAE_ALL_80.pth',
                                                    GPU_idx=args.GPU_idx, Prompt_input=False)
    # add GCMAE-CPIA
    write_additional_PuzzleTuning_comparison_script(add_idx='GCMAE-CPIA', lr_mystr=args.lr_mystr,
                                                    lrf_mystr=args.lrf_mystr,
                                                    data_augmentation_mode=args.data_augmentation_mode,
                                                    dataset_name=args.dataset_name,
                                                    model_weight_idx='timm_GCMAE_CPIAm_E80',
                                                    model_weight_name='ViT_b16_224_timm_GCMAE_ALL_80.pth',
                                                    GPU_idx=args.GPU_idx, Prompt_input=False)
    # add JIGSAW-CPIA
    write_additional_PuzzleTuning_comparison_script(add_idx='JIGSAW-CPIA', lr_mystr=args.lr_mystr,
                                                    lrf_mystr=args.lrf_mystr,
                                                    data_augmentation_mode=args.data_augmentation_mode,
                                                    dataset_name=args.dataset_name,
                                                    model_weight_idx='timm_JIGSAW_CPIAm_E50',
                                                    model_weight_name='ViT_b16_224_timm_JIGSAW_ALL_50.pth',
                                                    GPU_idx=args.GPU_idx, Prompt_input=False)

    # add DropPos-CPIA
    write_additional_PuzzleTuning_comparison_script(add_idx='DropPos-CPIA', lr_mystr=args.lr_mystr,
                                                    lrf_mystr=args.lrf_mystr,
                                                    data_augmentation_mode=args.data_augmentation_mode,
                                                    dataset_name=args.dataset_name,
                                                    model_weight_idx='timm_DropPos_CPIAm_E200',
                                                    model_weight_name='ViT_b16_224_timm_DropPos_ALL_200.pth',
                                                    GPU_idx=args.GPU_idx, Prompt_input=False)

    # add MAE+VPT
    write_additional_PuzzleTuning_comparison_script(add_idx='MAE-VPT_promptstate',
                                                    lr_mystr=args.lr_mystr,
                                                    lrf_mystr=args.lrf_mystr,
                                                    data_augmentation_mode=args.data_augmentation_mode,
                                                    dataset_name=args.dataset_name,
                                                    model_weight_idx='timm_mae_Prompt_CPIAm_E199_promptstate',
                                                    model_weight_name='ViT_b16_224_timm_PuzzleTuning_MAE_CPIAm_Prompt_Deep_tokennum_20_E_199_promptstate.pth',
                                                    GPU_idx='0', Prompt_input=True)
    # add SAE-MAE-start
    write_additional_PuzzleTuning_comparison_script(add_idx='SAE-MAE-start_promptstate',
                                                    lr_mystr=args.lr_mystr,
                                                    lrf_mystr=args.lrf_mystr,
                                                    data_augmentation_mode=args.data_augmentation_mode,
                                                    dataset_name=args.dataset_name,
                                                    model_weight_idx='ViT_base_MAEImageNet_PuzzleTuning_SAE_E_199_promptstate',
                                                    model_weight_name='ViT_b16_224_MAEImageNet_PuzzleTuning_SAE_CPIAm_Prompt_Deep_tokennum_20_E_199_promptstate.pth',
                                                    GPU_idx='0', Prompt_input=True)
    # add SAE-Random-start
    write_additional_PuzzleTuning_comparison_script(add_idx='SAE-Random-start_promptstate',
                                                    lr_mystr=args.lr_mystr,
                                                    lrf_mystr=args.lrf_mystr,
                                                    data_augmentation_mode=args.data_augmentation_mode,
                                                    dataset_name=args.dataset_name,
                                                    model_weight_idx='ViT_base_Random_PuzzleTuning_SAE_E_199_promptstate',
                                                    model_weight_name='ViT_b16_224_Random_PuzzleTuning_SAE_CPIAm_Prompt_Deep_tokennum_20_E_199_promptstate.pth',
                                                    GPU_idx='0', Prompt_input=True)

    # add PuzzleTuning_SAE_ViT_to_VPT-CPIA
    write_additional_PuzzleTuning_comparison_script(add_idx='PuzzleTuning_SAE_ViT-CPIA', lr_mystr=args.lr_mystr,
                                                    lrf_mystr=args.lrf_mystr,
                                                    data_augmentation_mode=args.data_augmentation_mode,
                                                    dataset_name=args.dataset_name,
                                                    model_weight_idx='timm_PuzzleTuning_SAE_E_199',
                                                    model_weight_name='ViT_b16_224_timm_PuzzleTuning_SAE_CPIAm_E_199.pth',
                                                    GPU_idx=args.GPU_idx, Prompt_input=False)
    '''

    # rewrite all
    '''
    write_PuzzleTuning_comparison_script(lr_mystr=args.lr_mystr, lrf_mystr=args.lrf_mystr,
                                         data_augmentation_mode=args.data_augmentation_mode,
                                         dataset_name=args.dataset_name, GPU_idx=args.GPU_idx)
    '''


    '''
    we can use the following codes to generates the additional exp scripts

    # read and auto generate task info
    import os
    path='/root/autodl-tmp/PuzzleTuning_Comparison'
    data_augmentation_dic = {'ROSE': '0', 'pRCC': '0', 'CAM16': '3', 'WBC': '3'}
    for exp_root in os.listdir(path):
        out_sh_name = exp_root + '.sh'
        lr_mystr = exp_root.split('_')[0]
        lrf_mystr = exp_root.split('_')[1].split('lf')[-1]
        dataset_name = exp_root.split('_')[-1]
        data_augmentation_mode = data_augmentation_dic[dataset_name]
        print('nohup python Experiment_script_helper.py --lr_mystr ' + lr_mystr + ' --lrf_mystr ' + lrf_mystr
              + ' --data_augmentation_mode ' + data_augmentation_mode + ' --dataset_name ' + dataset_name + ' > '
              + out_sh_name + ' 2>&1 &')

    # then, we use the shell to run this code with the generated lines

    # the generate sh files has a nohup line at their first lines, so we can use this to erase
    remove_nohup_ignoring_input_at_first_line(directory='./')

    # we can use the func to combine the sh files:
    concatenate_the_lines_from_several_files(directory='./', cat_file='0.sh')
    '''
def explore_the_experimetns_and_generate_nohup_lines(path='/root/autodl-tmp/PuzzleTuning_Comparison'):
    """
    Print one 'nohup python Experiment_script_helper.py ...' line for each experiment entry found in path.

    An entry name is split on '_': the first field is used as lr_mystr, the text after 'lf' in the second
    field as lrf_mystr and the last field as dataset_name, EG: '408_lf25_ROSE' -> 408 / 25 / ROSE.
    The output of each generated command is redirected to '<entry name>.sh'.

    Entries with fewer than three '_' separated fields (EG: '.DS_Store') or whose dataset is not a key of
    data_augmentation_dic are reported on stderr and skipped, instead of raising IndexError / KeyError.
    Entries are visited in sorted order, because os.listdir returns them in arbitrary order.

    :param path: folder whose entries are named after the experiments

    :return: None, the lines are printed to stdout
    """
    import sys  # local import, only needed for the skip notice

    # dataset name -> value passed to --data_augmentation_mode
    data_augmentation_dic = {'ROSE': '0', 'pRCC': '0', 'CAM16': '3', 'WBC': '3'}

    for exp_root in sorted(os.listdir(path)):
        fields = exp_root.split('_')
        dataset_name = fields[-1]
        if len(fields) < 3 or dataset_name not in data_augmentation_dic:
            # keep stdout clean: it only carries the generated command lines
            print('skip (not an experiment entry): ' + exp_root, file=sys.stderr)
            continue

        out_sh_name = exp_root + '.sh'
        lr_mystr = fields[0]
        lrf_mystr = fields[1].split('lf')[-1]
        data_augmentation_mode = data_augmentation_dic[dataset_name]
        print('nohup python Experiment_script_helper.py --lr_mystr ' + lr_mystr + ' --lrf_mystr ' + lrf_mystr
              + ' --data_augmentation_mode ' + data_augmentation_mode + ' --dataset_name ' + dataset_name + ' > '
              + out_sh_name + ' 2>&1 &')


if __name__ == '__main__':
    # run only as a script: the path is machine specific, importing this module must not list it
    explore_the_experimetns_and_generate_nohup_lines('/Users/zhangtianyi/Downloads/PuzzleTuning_Comparison')
--dataset_name pRCC > 504_lf25_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 509 --lrf_mystr 05 --data_augmentation_mode 0 --dataset_name pRCC > 509_lf05_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 507 --lrf_mystr 50 --data_augmentation_mode 3 --dataset_name CAM16 > 507_lf50_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 607 --lrf_mystr 10 --data_augmentation_mode 3 --dataset_name WBC > 607_lf10_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 401 --lrf_mystr 35 --data_augmentation_mode 0 --dataset_name pRCC > 401_lf35_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 402 --lrf_mystr 50 --data_augmentation_mode 3 --dataset_name CAM16 > 402_lf50_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 305 --lrf_mystr 05 --data_augmentation_mode 0 --dataset_name pRCC > 305_lf05_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 509 --lrf_mystr 25 --data_augmentation_mode 0 --dataset_name pRCC > 509_lf25_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 502 --lrf_mystr 50 --data_augmentation_mode 0 --dataset_name ROSE > 502_lf50_ROSE.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 504 --lrf_mystr 05 --data_augmentation_mode 0 --dataset_name pRCC > 504_lf05_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 507 --lrf_mystr 50 --data_augmentation_mode 0 --dataset_name pRCC > 507_lf50_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 504 --lrf_mystr 30 --data_augmentation_mode 0 --dataset_name ROSE > 504_lf30_ROSE.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 304 --lrf_mystr 35 --data_augmentation_mode 0 --dataset_name pRCC > 304_lf35_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 307 --lrf_mystr 20 --data_augmentation_mode 0 --dataset_name pRCC > 307_lf20_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 408 
--lrf_mystr 40 --data_augmentation_mode 3 --dataset_name WBC > 408_lf40_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 605 --lrf_mystr 50 --data_augmentation_mode 3 --dataset_name WBC > 605_lf50_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 503 --lrf_mystr 15 --data_augmentation_mode 0 --dataset_name pRCC > 503_lf15_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 303 --lrf_mystr 10 --data_augmentation_mode 0 --dataset_name ROSE > 303_lf10_ROSE.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 302 --lrf_mystr 15 --data_augmentation_mode 0 --dataset_name pRCC > 302_lf15_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 506 --lrf_mystr 20 --data_augmentation_mode 0 --dataset_name pRCC > 506_lf20_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 608 --lrf_mystr 25 --data_augmentation_mode 0 --dataset_name pRCC > 608_lf25_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 306 --lrf_mystr 10 --data_augmentation_mode 0 --dataset_name pRCC > 306_lf10_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 501 --lrf_mystr 50 --data_augmentation_mode 0 --dataset_name pRCC > 501_lf50_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 506 --lrf_mystr 35 --data_augmentation_mode 0 --dataset_name ROSE > 506_lf35_ROSE.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 506 --lrf_mystr 40 --data_augmentation_mode 3 --dataset_name CAM16 > 506_lf40_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 503 --lrf_mystr 10 --data_augmentation_mode 3 --dataset_name WBC > 503_lf10_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 404 --lrf_mystr 25 --data_augmentation_mode 3 --dataset_name WBC > 404_lf25_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 609 --lrf_mystr 35 --data_augmentation_mode 3 --dataset_name WBC > 609_lf35_WBC.sh 2>&1 & +nohup python 
Experiment_script_helper.py --lr_mystr 609 --lrf_mystr 20 --data_augmentation_mode 3 --dataset_name WBC > 609_lf20_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 404 --lrf_mystr 30 --data_augmentation_mode 3 --dataset_name WBC > 404_lf30_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 606 --lrf_mystr 15 --data_augmentation_mode 3 --dataset_name WBC > 606_lf15_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 607 --lrf_mystr 15 --data_augmentation_mode 3 --dataset_name WBC > 607_lf15_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 507 --lrf_mystr 30 --data_augmentation_mode 0 --dataset_name pRCC > 507_lf30_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 303 --lrf_mystr 05 --data_augmentation_mode 0 --dataset_name pRCC > 303_lf05_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 508 --lrf_mystr 10 --data_augmentation_mode 3 --dataset_name CAM16 > 508_lf10_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 409 --lrf_mystr 25 --data_augmentation_mode 3 --dataset_name WBC > 409_lf25_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 501 --lrf_mystr 15 --data_augmentation_mode 0 --dataset_name ROSE > 501_lf15_ROSE.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 306 --lrf_mystr 20 --data_augmentation_mode 3 --dataset_name CAM16 > 306_lf20_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 305 --lrf_mystr 15 --data_augmentation_mode 0 --dataset_name pRCC > 305_lf15_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 604 --lrf_mystr 35 --data_augmentation_mode 3 --dataset_name WBC > 604_lf35_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 407 --lrf_mystr 10 --data_augmentation_mode 3 --dataset_name WBC > 407_lf10_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 505 --lrf_mystr 50 --data_augmentation_mode 3 --dataset_name CAM16 > 
505_lf50_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 409 --lrf_mystr 25 --data_augmentation_mode 0 --dataset_name ROSE > 409_lf25_ROSE.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 407 --lrf_mystr 05 --data_augmentation_mode 3 --dataset_name WBC > 407_lf05_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 406 --lrf_mystr 05 --data_augmentation_mode 3 --dataset_name WBC > 406_lf05_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 507 --lrf_mystr 40 --data_augmentation_mode 0 --dataset_name pRCC > 507_lf40_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 307 --lrf_mystr 50 --data_augmentation_mode 0 --dataset_name pRCC > 307_lf50_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 301 --lrf_mystr 25 --data_augmentation_mode 3 --dataset_name CAM16 > 301_lf25_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 603 --lrf_mystr 50 --data_augmentation_mode 3 --dataset_name CAM16 > 603_lf50_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 503 --lrf_mystr 50 --data_augmentation_mode 0 --dataset_name ROSE > 503_lf50_ROSE.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 508 --lrf_mystr 10 --data_augmentation_mode 0 --dataset_name ROSE > 508_lf10_ROSE.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 508 --lrf_mystr 50 --data_augmentation_mode 3 --dataset_name CAM16 > 508_lf50_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 304 --lrf_mystr 05 --data_augmentation_mode 3 --dataset_name CAM16 > 304_lf05_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 304 --lrf_mystr 40 --data_augmentation_mode 3 --dataset_name CAM16 > 304_lf40_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 606 --lrf_mystr 15 --data_augmentation_mode 0 --dataset_name ROSE > 606_lf15_ROSE.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 608 --lrf_mystr 50 
--data_augmentation_mode 3 --dataset_name WBC > 608_lf50_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 609 --lrf_mystr 50 --data_augmentation_mode 3 --dataset_name WBC > 609_lf50_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 509 --lrf_mystr 25 --data_augmentation_mode 3 --dataset_name CAM16 > 509_lf25_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 607 --lrf_mystr 40 --data_augmentation_mode 3 --dataset_name CAM16 > 607_lf40_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 503 --lrf_mystr 20 --data_augmentation_mode 3 --dataset_name CAM16 > 503_lf20_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 505 --lrf_mystr 15 --data_augmentation_mode 3 --dataset_name WBC > 505_lf15_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 508 --lrf_mystr 35 --data_augmentation_mode 3 --dataset_name CAM16 > 508_lf35_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 608 --lrf_mystr 15 --data_augmentation_mode 0 --dataset_name pRCC > 608_lf15_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 506 --lrf_mystr 25 --data_augmentation_mode 0 --dataset_name ROSE > 506_lf25_ROSE.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 401 --lrf_mystr 05 --data_augmentation_mode 3 --dataset_name CAM16 > 401_lf05_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 501 --lrf_mystr 40 --data_augmentation_mode 0 --dataset_name pRCC > 501_lf40_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 608 --lrf_mystr 40 --data_augmentation_mode 3 --dataset_name WBC > 608_lf40_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 506 --lrf_mystr 20 --data_augmentation_mode 3 --dataset_name CAM16 > 506_lf20_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 503 --lrf_mystr 25 --data_augmentation_mode 0 --dataset_name pRCC > 503_lf25_pRCC.sh 2>&1 & +nohup python 
Experiment_script_helper.py --lr_mystr 609 --lrf_mystr 10 --data_augmentation_mode 0 --dataset_name ROSE > 609_lf10_ROSE.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 509 --lrf_mystr 05 --data_augmentation_mode 3 --dataset_name CAM16 > 509_lf05_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 305 --lrf_mystr 15 --data_augmentation_mode 3 --dataset_name CAM16 > 305_lf15_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 505 --lrf_mystr 50 --data_augmentation_mode 0 --dataset_name ROSE > 505_lf50_ROSE.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 608 --lrf_mystr 35 --data_augmentation_mode 0 --dataset_name pRCC > 608_lf35_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 402 --lrf_mystr 10 --data_augmentation_mode 3 --dataset_name CAM16 > 402_lf10_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 408 --lrf_mystr 10 --data_augmentation_mode 3 --dataset_name WBC > 408_lf10_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 306 --lrf_mystr 15 --data_augmentation_mode 3 --dataset_name CAM16 > 306_lf15_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 507 --lrf_mystr 15 --data_augmentation_mode 0 --dataset_name pRCC > 507_lf15_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 508 --lrf_mystr 50 --data_augmentation_mode 0 --dataset_name pRCC > 508_lf50_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 503 --lrf_mystr 10 --data_augmentation_mode 0 --dataset_name pRCC > 503_lf10_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 302 --lrf_mystr 25 --data_augmentation_mode 0 --dataset_name ROSE > 302_lf25_ROSE.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 502 --lrf_mystr 15 --data_augmentation_mode 0 --dataset_name ROSE > 502_lf15_ROSE.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 608 --lrf_mystr 20 --data_augmentation_mode 0 --dataset_name pRCC > 
608_lf20_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 509 --lrf_mystr 40 --data_augmentation_mode 0 --dataset_name pRCC > 509_lf40_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 504 --lrf_mystr 25 --data_augmentation_mode 3 --dataset_name WBC > 504_lf25_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 301 --lrf_mystr 50 --data_augmentation_mode 0 --dataset_name ROSE > 301_lf50_ROSE.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 302 --lrf_mystr 05 --data_augmentation_mode 0 --dataset_name ROSE > 302_lf05_ROSE.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 508 --lrf_mystr 20 --data_augmentation_mode 3 --dataset_name CAM16 > 508_lf20_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 506 --lrf_mystr 05 --data_augmentation_mode 0 --dataset_name pRCC > 506_lf05_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 301 --lrf_mystr 30 --data_augmentation_mode 3 --dataset_name WBC > 301_lf30_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 507 --lrf_mystr 05 --data_augmentation_mode 3 --dataset_name CAM16 > 507_lf05_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 307 --lrf_mystr 05 --data_augmentation_mode 0 --dataset_name pRCC > 307_lf05_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 503 --lrf_mystr 50 --data_augmentation_mode 3 --dataset_name CAM16 > 503_lf50_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 605 --lrf_mystr 15 --data_augmentation_mode 0 --dataset_name ROSE > 605_lf15_ROSE.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 503 --lrf_mystr 05 --data_augmentation_mode 0 --dataset_name ROSE > 503_lf05_ROSE.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 301 --lrf_mystr 35 --data_augmentation_mode 3 --dataset_name WBC > 301_lf35_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 606 --lrf_mystr 20 
--data_augmentation_mode 3 --dataset_name CAM16 > 606_lf20_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 304 --lrf_mystr 10 --data_augmentation_mode 3 --dataset_name CAM16 > 304_lf10_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 401 --lrf_mystr 30 --data_augmentation_mode 3 --dataset_name CAM16 > 401_lf30_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 304 --lrf_mystr 10 --data_augmentation_mode 0 --dataset_name pRCC > 304_lf10_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 504 --lrf_mystr 20 --data_augmentation_mode 0 --dataset_name pRCC > 504_lf20_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 504 --lrf_mystr 15 --data_augmentation_mode 0 --dataset_name ROSE > 504_lf15_ROSE.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 506 --lrf_mystr 50 --data_augmentation_mode 3 --dataset_name CAM16 > 506_lf50_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 505 --lrf_mystr 25 --data_augmentation_mode 0 --dataset_name ROSE > 505_lf25_ROSE.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 501 --lrf_mystr 20 --data_augmentation_mode 0 --dataset_name ROSE > 501_lf20_ROSE.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 301 --lrf_mystr 20 --data_augmentation_mode 3 --dataset_name WBC > 301_lf20_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 502 --lrf_mystr 50 --data_augmentation_mode 3 --dataset_name WBC > 502_lf50_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 305 --lrf_mystr 20 --data_augmentation_mode 3 --dataset_name CAM16 > 305_lf20_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 401 --lrf_mystr 30 --data_augmentation_mode 0 --dataset_name pRCC > 401_lf30_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 409 --lrf_mystr 30 --data_augmentation_mode 0 --dataset_name ROSE > 409_lf30_ROSE.sh 2>&1 & +nohup python 
Experiment_script_helper.py --lr_mystr 303 --lrf_mystr 50 --data_augmentation_mode 3 --dataset_name CAM16 > 303_lf50_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 505 --lrf_mystr 05 --data_augmentation_mode 0 --dataset_name ROSE > 505_lf05_ROSE.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 607 --lrf_mystr 10 --data_augmentation_mode 0 --dataset_name ROSE > 607_lf10_ROSE.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 309 --lrf_mystr 10 --data_augmentation_mode 0 --dataset_name pRCC > 309_lf10_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 302 --lrf_mystr 50 --data_augmentation_mode 0 --dataset_name pRCC > 302_lf50_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 306 --lrf_mystr 35 --data_augmentation_mode 3 --dataset_name CAM16 > 306_lf35_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 304 --lrf_mystr 05 --data_augmentation_mode 0 --dataset_name ROSE > 304_lf05_ROSE.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 605 --lrf_mystr 10 --data_augmentation_mode 3 --dataset_name WBC > 605_lf10_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 606 --lrf_mystr 30 --data_augmentation_mode 3 --dataset_name WBC > 606_lf30_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 502 --lrf_mystr 20 --data_augmentation_mode 3 --dataset_name WBC > 502_lf20_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 506 --lrf_mystr 15 --data_augmentation_mode 0 --dataset_name pRCC > 506_lf15_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 509 --lrf_mystr 50 --data_augmentation_mode 0 --dataset_name pRCC > 509_lf50_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 609 --lrf_mystr 05 --data_augmentation_mode 3 --dataset_name WBC > 609_lf05_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 303 --lrf_mystr 10 --data_augmentation_mode 0 --dataset_name pRCC > 
303_lf10_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 609 --lrf_mystr 10 --data_augmentation_mode 3 --dataset_name WBC > 609_lf10_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 304 --lrf_mystr 15 --data_augmentation_mode 3 --dataset_name CAM16 > 304_lf15_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 503 --lrf_mystr 15 --data_augmentation_mode 0 --dataset_name ROSE > 503_lf15_ROSE.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 507 --lrf_mystr 20 --data_augmentation_mode 3 --dataset_name CAM16 > 507_lf20_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 606 --lrf_mystr 25 --data_augmentation_mode 3 --dataset_name WBC > 606_lf25_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 402 --lrf_mystr 20 --data_augmentation_mode 3 --dataset_name CAM16 > 402_lf20_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 503 --lrf_mystr 35 --data_augmentation_mode 0 --dataset_name ROSE > 503_lf35_ROSE.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 303 --lrf_mystr 05 --data_augmentation_mode 0 --dataset_name ROSE > 303_lf05_ROSE.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 608 --lrf_mystr 30 --data_augmentation_mode 0 --dataset_name pRCC > 608_lf30_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 302 --lrf_mystr 35 --data_augmentation_mode 0 --dataset_name ROSE > 302_lf35_ROSE.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 306 --lrf_mystr 30 --data_augmentation_mode 3 --dataset_name CAM16 > 306_lf30_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 505 --lrf_mystr 05 --data_augmentation_mode 3 --dataset_name CAM16 > 505_lf05_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 302 --lrf_mystr 40 --data_augmentation_mode 0 --dataset_name pRCC > 302_lf40_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 302 --lrf_mystr 05 
--data_augmentation_mode 3 --dataset_name WBC > 302_lf05_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 409 --lrf_mystr 20 --data_augmentation_mode 0 --dataset_name ROSE > 409_lf20_ROSE.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 302 --lrf_mystr 10 --data_augmentation_mode 3 --dataset_name WBC > 302_lf10_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 306 --lrf_mystr 10 --data_augmentation_mode 3 --dataset_name CAM16 > 306_lf10_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 504 --lrf_mystr 10 --data_augmentation_mode 0 --dataset_name pRCC > 504_lf10_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 304 --lrf_mystr 35 --data_augmentation_mode 0 --dataset_name ROSE > 304_lf35_ROSE.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 505 --lrf_mystr 40 --data_augmentation_mode 3 --dataset_name WBC > 505_lf40_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 504 --lrf_mystr 40 --data_augmentation_mode 3 --dataset_name WBC > 504_lf40_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 503 --lrf_mystr 25 --data_augmentation_mode 3 --dataset_name WBC > 503_lf25_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 606 --lrf_mystr 10 --data_augmentation_mode 0 --dataset_name ROSE > 606_lf10_ROSE.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 305 --lrf_mystr 40 --data_augmentation_mode 3 --dataset_name CAM16 > 305_lf40_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 305 --lrf_mystr 05 --data_augmentation_mode 3 --dataset_name CAM16 > 305_lf05_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 609 --lrf_mystr 15 --data_augmentation_mode 3 --dataset_name WBC > 609_lf15_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 606 --lrf_mystr 20 --data_augmentation_mode 3 --dataset_name WBC > 606_lf20_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py 
--lr_mystr 502 --lrf_mystr 30 --data_augmentation_mode 3 --dataset_name WBC > 502_lf30_WBC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 509 --lrf_mystr 10 --data_augmentation_mode 0 --dataset_name pRCC > 509_lf10_pRCC.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 404 --lrf_mystr 35 --data_augmentation_mode 3 --dataset_name CAM16 > 404_lf35_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 401 --lrf_mystr 15 --data_augmentation_mode 3 --dataset_name CAM16 > 401_lf15_CAM16.sh 2>&1 & +nohup python Experiment_script_helper.py --lr_mystr 505 --lrf_mystr 35 --data_augmentation_mode 0 --dataset_name ROSE > 505_lf35_ROSE.sh 2>&1 & +''' diff --git a/PuzzleTuning/utils/SoftCrossEntropyLoss.py b/PuzzleTuning/utils/SoftCrossEntropyLoss.py new file mode 100644 index 0000000000000000000000000000000000000000..ff72edeab929320600269630e27b6e6e5da54cac --- /dev/null +++ b/PuzzleTuning/utils/SoftCrossEntropyLoss.py @@ -0,0 +1,33 @@ +""" +SoftCrossEntropy loss Script ver: May 17th 19:00 + +update +SoftlabelCrossEntropy loss for soft-label based augmentations +fixme 好像说reduction='sum' 有问题? 
# define SoftlabelCrossEntropy loss for soft-label based augmentations
def SoftCrossEntropy(input, target, reduction='sum'):
    """
    Cross-entropy between logits and soft (probability-vector) labels.

    :param input: logits; log-softmax is taken over dim 1 and dim 0 is treated as the batch
    :param target: soft labels, multiplied element-wise with the log-likelihood
    :param reduction: 'average' or 'mean' divide the summed loss by the batch size;
        any other value (including the default 'sum') returns the plain sum
    :return: scalar loss tensor
    """
    log_likelihood = -F.log_softmax(input, dim=1)
    total = torch.sum(torch.mul(log_likelihood, target))

    # 'mean' is the spelling used by torch.nn losses; accept it next to the legacy 'average'
    if reduction in ('average', 'mean'):
        return total / input.shape[0]
    return total


class SoftlabelCrossEntropy(nn.modules.loss._Loss):
    """
    nn.Module wrapper around SoftCrossEntropy for soft-label based augmentations.

    :param reduction: forwarded unchanged to SoftCrossEntropy on every forward call
    """
    __constants__ = ['reduction']

    def __init__(self, reduction: str = 'sum') -> None:
        # _Loss.__init__ is (size_average, reduce, reduction): passing `reduction` positionally
        # lands in the deprecated `size_average` slot, which emits a warning and forces
        # self.reduction to 'mean'. It has to be passed by keyword.
        super(SoftlabelCrossEntropy, self).__init__(reduction=reduction)

    def forward(self, input: Tensor, target: Tensor) -> Tensor:
        return SoftCrossEntropy(input, target, reduction=self.reduction)
cls_num = len(cls_list) + + indicator_list = ['Precision', 'Recall', 'Sensitivity', 'Specificity', 'NPV', 'F1_score'] + indicator_num = len(indicator_list) + + blank_num = cls_num * indicator_num + first_blank_num = blank_num // 2 + + empty_str1 = ' ,' # 对齐Acc + for i in range(0, first_blank_num): + empty_str1 += ' ,' + + empty_str2 = '' + for i in range(0, blank_num): + empty_str2 += ' ,' + + result_csv_name = os.path.split(json_path)[1].split('.')[0] + '.csv' + result_indicators = [os.path.split(json_path)[1].split('.')[0], ] # 第一个位置留给model name + + with open(os.path.join(record_dir, result_csv_name), 'w') as f_log: + if test_status: + # 写头文件1 + f_log.write('Phase:,' + empty_str1 + ' Test\n') + head = 'Epoch:, ' + class_head = 'Acc, ' # 目标 'Acc, '+ 类别* indicator_list + for cls in cls_list: + for indicator in indicator_list: + class_head += cls + '_' + indicator + ', ' + + # 写头文件2 + f_log.write(head + class_head + '\n') # Test + f_log.close() + + else: + # 写头文件1 + f_log.write('Phase:,' + empty_str1 + ' Train' + empty_str2 + ' Val\n') + + head = 'Epoch:, ' + class_head = 'Acc, ' # 目标 'Acc, '+ 类别* indicator_list + for cls in cls_list: + for indicator in indicator_list: + class_head += cls + '_' + indicator + ', ' + + # 写头文件2 + f_log.write(head + class_head + class_head + '\n') # Train val + f_log.close() + + # 初始化最佳 + best_val_acc = 0.0 + + for epoch in range(1, epoch_num + 1): + if test_status: + epoch = 'test' + epoch_indicators = [epoch, ] # 第一个位置留给epoch + + for phase in ['train', 'val']: + if test_status: + phase = 'test' + + sum_tp = 0.0 + + phase_indicators = [0.0, ] # 第一个位置留给ACC + + for cls in cls_list: + log = load_dict[str(epoch)][phase][cls] + tp = log['tp'] + tn = log['tn'] + fp = log['fp'] + fn = log['fn'] + + sum_tp += tp + + Precision = compute_precision(tp, fp) + Recall = compute_recall(tp, fn) + + Sensitivity = compute_sensitivity(tp, fn) + Specificity = compute_specificity(tn, fp) + + NPV = compute_NPV(tn, fn) + F1_score = compute_f1_score(tp, tn, 
def read_all_logs(logs_path, record_dir):
    """
    Convert every '.json' log found under logs_path and collect one summary row per log.

    Each log is handed to read_a_json_log (which writes its own per-log CSV into record_dir);
    the list it returns is written as one comma-separated row of '<logs_path basename>.csv'
    inside record_dir.

    :param logs_path: root directory that is searched recursively for '.json' logs
    :param record_dir: output directory for the CSV files, created if missing
    """
    os.makedirs(record_dir, exist_ok=True)

    json_paths = find_all_files(logs_path, suffix='.json')

    # normpath drops a trailing separator, so 'runs/' gives 'runs.csv' instead of '.csv'
    result_csv_name = os.path.basename(os.path.normpath(logs_path)) + '.csv'

    # the with-block closes the file; no explicit close() is needed
    with open(os.path.join(record_dir, result_csv_name), 'w') as f_log:
        for json_path in json_paths:
            # best-epoch (or test) indicators of one model json log
            result_indicators = read_a_json_log(json_path, record_dir)
            f_log.write(''.join(str(i) + ', ' for i in result_indicators))
            f_log.write('\n')

    print('record_dir:', record_dir)
def ACC_loss(PATH, out_file_path):
    """
    Plot train accuracy against train loss for every TensorBoard event file under PATH.

    One curve is drawn per event file, labelled with the name of the directory that contains
    it. The figure is saved to out_file_path and then shown.

    :param PATH: root directory searched recursively for files whose name starts with 'events'
    :param out_file_path: path of the image file to write
    """
    fig = plt.figure(figsize=(6, 4))
    ax1 = fig.add_subplot(111)

    runs_all = find_all_files_startwith(PATH, suffix='events')
    print(runs_all)

    for runs_path in runs_all:
        # the parent directory name identifies the run
        model_idx = os.path.split(os.path.split(runs_path)[0])[1]

        ea = event_accumulator.EventAccumulator(runs_path)
        ea.Reload()  # required: loads the event contents into the accumulator

        train_ACC = ea.scalars.Items("train_ACC")
        train_loss = ea.scalars.Items("train_loss")

        ax1.plot([i.value for i in train_loss], [i.value for i in train_ACC], label=model_idx)

    plt.legend(loc='lower right')
    ax1.set_xlabel("Loss")
    ax1.set_ylabel("Acc")

    # Save before showing: on interactive backends show() can leave pyplot with an empty
    # current figure, so a later savefig() would write a blank image.
    fig.savefig(out_file_path, dpi=1000)
    plt.show()
transforms.ToTensor() + ]), + 'val': transforms.Compose([ + transforms.Resize([edge_size, edge_size]), + transforms.ToTensor() + ]), + } + + elif data_augmentation_mode == 2: # warwick + data_transforms = { + 'train': transforms.Compose([ + transforms.RandomRotation((0, 180)), + transforms.RandomHorizontalFlip(), + transforms.RandomVerticalFlip(), + transforms.CenterCrop(360), # center area for classification + transforms.Resize([edge_size, edge_size]), + transforms.ColorJitter(brightness=0.15, contrast=0.3, saturation=0.3, hue=0.06), + # HSL shift operation + transforms.ToTensor() + ]), + 'val': transforms.Compose([ + transforms.CenterCrop(360), + transforms.Resize([edge_size, edge_size]), + transforms.ToTensor() + ]), + } + + elif data_augmentation_mode == 3: # for the squre input: just resize + data_transforms = { + 'train': transforms.Compose([ + transforms.RandomHorizontalFlip(), + transforms.RandomVerticalFlip(), + transforms.Resize([edge_size, edge_size]), + transforms.ColorJitter(brightness=0.15, contrast=0.3, saturation=0.3, hue=0.06), + # HSL shift operation + transforms.ToTensor() + ]), + 'val': transforms.Compose([ + transforms.Resize([edge_size, edge_size]), + transforms.ToTensor() + ]), + } + else: + print('no legal data augmentation is selected') + return -1 + return data_transforms diff --git a/PuzzleTuning/utils/dual_augmentation.py b/PuzzleTuning/utils/dual_augmentation.py new file mode 100644 index 0000000000000000000000000000000000000000..9b802f604a39357ee7af29af934a35b37dfaf111 --- /dev/null +++ b/PuzzleTuning/utils/dual_augmentation.py @@ -0,0 +1,242 @@ +""" +dual augmentation on both images and their masks Script ver: Apr 10th 11:20 + + +""" +import random +import numpy as np +import cv2 +from PIL import Image +from torchvision import transforms +from utils.tools import to_2tuple + + +class DualCompose: # fit pytorch transform + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, image, mask=None): + # 
class Dual_Rotate:
    """
    Random rotation applied with the same matrix to an image and its mask.
    image shape: (height, width, channels)
    mask shape: (height, width)
    possibility: possibility for rotate
    range: exclusive upper bound of the rotation angle; the angle is drawn from [0, range)
    """

    def __init__(self, possibility=0.5, range=20):
        self.possibility = possibility
        self.range = range

    def __call__(self, image, mask):
        # array shape is (height, width, ...) while cv2 points and sizes are (width, height);
        # mixing the two up breaks non-square inputs
        height, width = image.shape[:2]

        if random.random() <= self.possibility:
            angle = np.random.randint(0, self.range)

            center = (width // 2, height // 2)
            # rotation matrix: rotation center, angle, scale factor
            M = cv2.getRotationMatrix2D(center, -angle, 1)
            # affine warp: source, matrix, output size
            image = cv2.warpAffine(image, M, (width, height))
            # NOTE(review): the mask is warped with cv2's default interpolation, which may
            # produce intermediate label values at edges; confirm whether nearest-neighbour is wanted
            mask = cv2.warpAffine(mask.astype(np.uint8), M, (width, height))

        # np.int was only an alias of the builtin int and was removed in NumPy 1.24
        return image.astype(np.uint8), mask.astype(int)
+ DualImage : Transform CV2 images and their mask to PIL images + train_domain_transform : transforms.ColorJitter on PIL images + transform: PIL crop, resize and to Tensor + + USAGE: + + IN Train: + image, mask = self.Dual_transform(image, mask) + # image color jitter shifting + image = self.train_domain_transform(image) + # crop + resize + image = self.transform(image) + + IN Val $ Test: + + # 0/255 mask -> binary mask + image, mask = self.DualImage(image, mask) + # crop + resize + image = self.transform(image) + """ + + edge_size = to_2tuple(edge_size) + + if data_augmentation_mode == 0: # ROSE + MARS + # apply the on-time synchornized transform on image and mask togather + Dual_transform = DualCompose([ + Dual_Rotate(possibility=0.8, range=180), + Dual_RandomHorizontalFlip(), + Dual_RandomVerticalFlip(), + ]) + # val & test use DualImage to convert PIL Image + DualImage = DualImageTransform() + + # ColorJitter for image only + train_domain_transform = transforms.Compose([ + # HSL shift operation + transforms.ColorJitter(brightness=0.15, contrast=0.3, saturation=0.3, hue=0.06), + ]) + + # lastly, the synchornized separate transform + transform = transforms.Compose([ + transforms.CenterCrop(700), # center area for classification + transforms.Resize(edge_size), + transforms.ToTensor(), # hwc -> chw tensor + ]) + + elif data_augmentation_mode == 1: # Cervical + # apply the on-time synchornized transform on image and mask togather + Dual_transform = DualCompose([ + Dual_Rotate(possibility=0.8, range=180), + Dual_RandomHorizontalFlip(), + Dual_RandomVerticalFlip(), + ]) + # val & test use DualImage to convert PIL Image + DualImage = DualImageTransform() + + # ColorJitter for image only + train_domain_transform = transforms.Compose([ + # HSL shift operation + transforms.ColorJitter(brightness=0.15, contrast=0.3, saturation=0.3, hue=0.06), + ]) + + # lastly, the synchornized separate transform + transform = transforms.Compose([ + transforms.Resize(edge_size), + 
transforms.ToTensor(), # hwc -> chw tensor + ]) + + elif data_augmentation_mode == 2: # + # apply the on-time synchornized transform on image and mask togather + Dual_transform = DualCompose([ + Dual_Rotate(possibility=0.8, range=180), + Dual_RandomHorizontalFlip(), + Dual_RandomVerticalFlip(), + ]) + # val & test use DualImage to convert PIL Image + DualImage = DualImageTransform() + + # ColorJitter for image only + train_domain_transform = transforms.Compose([ + # HSL shift operation + transforms.ColorJitter(brightness=0.15, contrast=0.3, saturation=0.3, hue=0.06), + ]) + + # lastly, the synchornized separate transform + transform = transforms.Compose([ + transforms.CenterCrop(360), # center area for classification + transforms.Resize(edge_size), + transforms.ToTensor(), # hwc -> chw tensor + ]) + + elif data_augmentation_mode == 3: # for the squre input: just resize + # apply the on-time synchornized transform on image and mask togather + Dual_transform = DualCompose([ + # Dual_Rotate(possibility=0.8, range=180), + Dual_RandomHorizontalFlip(), + Dual_RandomVerticalFlip(), + ]) + # val & test use DualImage to convert PIL Image + DualImage = DualImageTransform() + + # ColorJitter for image only + train_domain_transform = transforms.Compose([ + # HSL shift operation + transforms.ColorJitter(brightness=0.15, contrast=0.3, saturation=0.3, hue=0.06), + ]) + + # lastly, the synchornized separate transform + transform = transforms.Compose([ + transforms.Resize(edge_size), + transforms.ToTensor(), # hwc -> chw tensor + ]) + + else: + print('no legal data augmentation is selected') + return -1 + + return Dual_transform, DualImage, train_domain_transform, transform diff --git a/PuzzleTuning/utils/fmix.py b/PuzzleTuning/utils/fmix.py new file mode 100644 index 0000000000000000000000000000000000000000..e006c02e8e0c785e7904ea244d0d8c7102c86105 --- /dev/null +++ b/PuzzleTuning/utils/fmix.py @@ -0,0 +1,194 @@ +""" +from official release of ... 
def fftfreqnd(h, w=None, z=None):
    """ Get bin values for discrete fourier transform of size (h, w, z)

    The per-axis frequencies are broadcast against each other and combined into their
    Euclidean magnitude.

    :param h: Required, first dimension size
    :param w: Optional, second dimension size
    :param z: Optional, third dimension size
    """
    freq_y = np.fft.fftfreq(h)
    freq_x = 0
    freq_z = 0

    if w is not None:
        freq_y = np.expand_dims(freq_y, -1)
        # only the leading part of the w axis is kept: one extra bin when w is odd
        keep = w // 2 + (2 if w % 2 == 1 else 1)
        freq_x = np.fft.fftfreq(w)[:keep]

    if z is not None:
        freq_y = np.expand_dims(freq_y, -1)
        # the z axis is used in full regardless of parity
        freq_z = np.fft.fftfreq(z)[:, None]

    return np.sqrt(freq_x * freq_x + freq_y * freq_y + freq_z * freq_z)
def sample_lam(alpha, reformulate=False):
    """ Draw a mixing ratio lambda from a beta distribution

    :param alpha: Alpha value for beta distribution
    :param reformulate: If True, draws from Beta(alpha + 1, alpha) (the reformulation of [1])
        instead of the symmetric Beta(alpha, alpha)
    """
    first_shape = alpha + 1 if reformulate else alpha
    return beta.rvs(first_shape, alpha)
def sample_mask(alpha, decay_power, shape, max_soft=0.0, reformulate=False):
    """ Draw a mean lambda from the beta distribution parametrised by alpha, build a low frequency
    image and binarise it around that lambda

    :param alpha: Alpha value for beta distribution from which to sample mean of mask
    :param decay_power: Decay power for frequency decay prop 1/f**d
    :param shape: Shape of desired mask, list up to 3 dims (a bare int is treated as one dim)
    :param max_soft: Softening value between 0 and 0.5 which smooths hard edges in the mask.
    :param reformulate: If True, uses the reformulation of [1].
    :return: (lambda, mask)
    """
    dims = (shape,) if isinstance(shape, int) else shape

    # lambda is sampled first, then the low frequency image
    lam = sample_lam(alpha, reformulate)
    low_freq = make_low_freq_image(decay_power, dims)

    return lam, binarise_mask(low_freq, lam, dims, max_soft)
def compute_specificity(tn, fp):
    """
    Specificity (true negative rate) = TN / (TN + FP), returned as a percentage.

    Returns 0.0 when there are no negative samples (TN + FP == 0).
    """
    if tn + fp == 0:
        return 0.0
    return (tn * 100) / float(tn + fp)
def compute_NPV(tn, fn):  # Negative Predictive Value
    """
    Negative Predictive Value = TN / (TN + FN), returned as a percentage.

    Returns 0.0 when nothing was predicted negative (TN + FN == 0).
    """
    predicted_negative = tn + fn
    return (tn * 100) / float(predicted_negative) if predicted_negative != 0 else 0.0
def saliency_bbox(img, lam):
    """
    Bounding box centred on the most salient pixel of one image.

    :param img: a single image tensor; it is moved to CPU and transposed (1, 2, 0) before the
        saliency detector runs, so it is presumably channel-first — confirm against callers
    :param lam: ratio that sets the box size; each side is scaled by sqrt(1 - lam)
    :return: bbx1, bby1, bbx2, bby2, clipped to the image extent along dims 1 and 2
    """
    size = img.size()
    W = size[1]
    H = size[2]
    cut_rat = np.sqrt(1. - lam)
    # np.int was removed in NumPy 1.24; np.int64 matches what rand_bbox already uses
    cut_w = np.int64(W * cut_rat)
    cut_h = np.int64(H * cut_rat)

    # initialize OpenCV's static fine grained saliency detector and
    # compute the saliency map
    temp_img = img.cpu().numpy().transpose(1, 2, 0)
    saliency = cv2.saliency.StaticSaliencyFineGrained_create()
    (success, saliencyMap) = saliency.computeSaliency(temp_img)
    saliencyMap = (saliencyMap * 255).astype("uint8")

    # location of the saliency maximum, used as the box centre
    maximum_indices = np.unravel_index(np.argmax(saliencyMap, axis=None), saliencyMap.shape)
    x = maximum_indices[0]
    y = maximum_indices[1]

    bbx1 = np.clip(x - cut_w // 2, 0, W)
    bby1 = np.clip(y - cut_h // 2, 0, H)
    bbx2 = np.clip(x + cut_w // 2, 0, W)
    bby2 = np.clip(y + cut_h // 2, 0, H)

    return bbx1, bby1, bbx2, bby2
lam_list = [] # a list to record operating ratio + + for i in range(self.batch_size): + + if np.random.randint(0, 101) > 100 * self.p or (not act): + # trigger the augmentation operation + lam_list.append(-1) + continue + + lam = np.random.beta(self.alpha, self.alpha) + bbx1, bby1, bbx2, bby2 = rand_bbox(ori_inputs.size(), lam) # get random bbox + + cutout_inputs[i, :, bbx1:bbx2, bby1:bby2] = 0 + + # update the ratio of (area of ori_image on new masked image) for soft-label + lam = 1 - ((bbx2 - bbx1) * (bby2 - bby1) / (ori_inputs.size()[2] * ori_inputs.size()[3])) + lam_list.append(lam) + + long_label = labels.argmax(dim=1) + + # NOTICE cutout use long label and ori_crossentropy instead of soft-label and soft-label_crossentropy + return cutout_inputs, long_label, long_label + + +class CutMix(object): + def __init__(self, alpha=2, shuffle_p=1.0, class_num=2, batch_size=4, device='cpu'): + """ + CutMix augmentation arXiv:1905.04899 + :param alpha: alpha + :param shuffle_p: chance of trigger augmentation + :param class_num: number of classification categories + :param batch_size: batch_size of training + :param device: CUDA or CPU + """ + self.alpha = alpha + self.class_num = class_num + self.batch_size = batch_size + + # calibrate the trigger chance of p, new ratio is the change of operation occur in each batch + self.p = shuffle_p * (perm(self.batch_size, self.batch_size) + / (perm(self.batch_size, self.batch_size) - + perm(self.batch_size - 1, self.batch_size - 1))) + self.device = torch.device(device) + + def __call__(self, inputs, labels, act=True): + + labels = torch.eye(self.class_num).to(self.device)[labels, :] # one-hot hard label + ori_inputs = inputs.clone().to(self.device) # duplicate inputs for ori inputs + cutmix_inputs = inputs.clone().to(self.device) # duplicate inputs for outputs + lam_list = [] # a list to record operating ratio + indices = torch.randperm(self.batch_size, device=self.device) # shuffle indices + shuffled_inputs = 
class Mixup(object):
    """Mixup augmentation (arXiv:1710.09412): convex blend of each image with a shuffled partner."""

    def __init__(self, alpha=2, shuffle_p=1.0, class_num=2, batch_size=4, device='cpu'):
        """
        :param alpha: both shape parameters of the Beta distribution the mixing ratio is drawn from
        :param shuffle_p: chance of trigger augmentation
        :param class_num: number of classification categories
        :param batch_size: nominal batch size of training, used to calibrate the trigger chance
        :param device: CUDA or CPU
        """
        self.alpha = alpha
        self.class_num = class_num
        self.batch_size = batch_size

        # Calibrate the trigger chance: B! / (B! - (B-1)!) simplifies to B / (B - 1).
        # Evaluating the factorials themselves overflows to inf (and the ratio to NaN)
        # once batch_size exceeds 170, which silently made the augmentation always fire.
        if batch_size > 1:
            self.p = shuffle_p * batch_size / (batch_size - 1)
        else:
            # a single-sample batch has no partner to mix with; nothing to calibrate
            self.p = shuffle_p
        self.device = torch.device(device)

    def __call__(self, inputs, labels, act=True):
        """
        :param inputs: image batch, first dimension is the batch dimension
        :param labels: integer class indices, one per sample
        :param act: set to False to force the augmentation off (e.g. in validation)

        :return: (mixed inputs, soft labels [B, class_num], long labels [B])
        """
        device = self.device
        labels = torch.as_tensor(labels, device=device)
        soft_labels = torch.eye(self.class_num, device=device)[labels, :]  # one-hot hard label

        ori_inputs = inputs.to(device)
        mixup_inputs = ori_inputs.clone()  # outputs; the caller's tensor is never modified

        # use the real batch size: the last batch of an epoch may be shorter than batch_size
        batch = ori_inputs.shape[0]
        indices = torch.randperm(batch, device=device)  # shuffle indices
        shuffled_inputs = ori_inputs[indices]
        shuffled_labels = soft_labels[indices]  # advanced indexing copies, safe against the in-place edits below

        for i in range(batch):
            # the trigger number is drawn before `act` is checked so that the numpy RNG
            # stream is consumed identically whether or not the augmentation is active
            if np.random.randint(0, 101) > 100 * self.p or (not act):
                continue

            lam = float(np.random.beta(self.alpha, self.alpha))
            mixup_inputs[i] = ori_inputs[i] * lam + shuffled_inputs[i] * (1 - lam)
            soft_labels[i] = soft_labels[i] * lam + shuffled_labels[i] * (1 - lam)

        long_label = soft_labels.argmax(dim=1)
        return mixup_inputs, soft_labels, long_label
class ResizeMix(object):
    """ResizeMix augmentation (arXiv:2012.11101): paste a down-scaled partner image into a random box."""

    def __init__(self, shuffle_p=1.0, class_num=2, batch_size=4, device='cpu'):
        """
        :param shuffle_p: chance of trigger augmentation
        :param class_num: number of classification categories
        :param batch_size: nominal batch size of training; kept for interface compatibility,
            the number of samples actually processed is read from the inputs at call time
        :param device: CUDA or CPU
        """
        # ori batch_size=512
        self.class_num = class_num
        self.batch_size = batch_size
        self.p = shuffle_p
        self.device = torch.device(device)

    def __call__(self, inputs, labels, alpha=0.1, beta=0.8, act=True):
        """
        :param inputs: image batch indexed as [B, C, dim2, dim3]
        :param labels: integer class indices, one per sample
        :param alpha: lower bound of the uniform distribution the box ratio is drawn from
        :param beta: upper bound of the uniform distribution the box ratio is drawn from
        :param act: set to False to force the augmentation off (e.g. in validation)

        :return: (mixed inputs, soft labels [B, class_num], long labels [B])
        """
        device = self.device
        labels = torch.as_tensor(labels, device=device)
        soft_labels = torch.eye(self.class_num, device=device)[labels, :]  # one-hot hard label

        ori_inputs = inputs.to(device)
        resizemix_inputs = ori_inputs.clone()  # outputs; the caller's tensor is never modified
        size = ori_inputs.size()

        # use the real batch size: the last batch of an epoch may be shorter than batch_size
        batch = size[0]
        indices = torch.randperm(batch, device=device)  # shuffle indices
        shuffled_inputs = ori_inputs[indices]
        shuffled_labels = soft_labels[indices]

        for i in range(batch):
            # the trigger number is drawn before `act` is checked so that the numpy RNG
            # stream is consumed identically whether or not the augmentation is active
            if np.random.randint(0, 101) > 100 * self.p or (not act):
                continue

            lam = np.random.uniform(alpha, beta)
            bbx1, bby1, bbx2, bby2 = rand_bbox(size, lam)  # get random bbox
            box_h, box_w = int(bbx2 - bbx1), int(bby2 - bby1)

            if box_h == 0 or box_w == 0:
                # degenerate box after clipping: nothing to paste, the sample stays unchanged
                continue

            # Resize the partner directly as a tensor. Going through a PIL image would
            # quantise it to 8 bit and corrupt values outside [0, 1] (e.g. normalised inputs).
            # NOTE(review): tensor resizing may differ slightly from PIL's filter depending on
            # the installed torchvision version's antialias default - confirm if exact parity matters.
            torch_resize = Resize([box_h, box_w])
            resizemix_inputs[i, :, bbx1:bbx2, bby1:bby2] = torch_resize(shuffled_inputs[i])

            # update the ratio of (area of ori_image on new image) for soft-label
            lam = 1 - (box_h * box_w / (size[2] * size[3]))
            soft_labels[i] = soft_labels[i] * lam + shuffled_labels[i] * (1 - lam)

        long_label = soft_labels.argmax(dim=1)
        return resizemix_inputs, soft_labels, long_label
# CellMix
class CellMix(object):
    """CellMix augmentation (arXiv:2301.11513): exchange image patches between samples of a batch."""

    def __init__(self, shuffle_p=1.0, class_num=2, strategy='In-place', group_shuffle_size=-1, device='cpu'):
        """
        :param shuffle_p: chance of trigger augmentation
        :param class_num: number of classification categories
        :param strategy: 'In-place' or 'Random' to shuffle the relation patches within the batch
        :param group_shuffle_size: the size of shuffling group in the batch, -1 to all
        :param device: CUDA or CPU; kept for interface compatibility, all tensors created
            during a call follow the device of the inputs
        """
        self.p = shuffle_p
        self.CLS = class_num  # classification category number of the task
        self.device = device
        self.strategy = strategy  # 'In-place' or 'Random'
        self.group_shuffle_size = group_shuffle_size  # -1 for whole batch

    def _batch_shuffle_indices(self, noise):
        """Turn [B, N] random noise into [B, N] source-sample indices, shuffling along dim 0 per group."""
        batch = noise.shape[0]
        group = self.group_shuffle_size

        if group == -1 or group >= batch:  # CellMix-Split: one group spanning the whole batch
            return torch.argsort(noise, dim=0)

        if group <= 0:
            raise ValueError('group_shuffle_size must be -1 or a positive integer, got {}'.format(group))

        # CellMix-Group: argsort each consecutive group on its own and offset the result back
        # to batch-level indices. A trailing group may be shorter when the batch size is not a
        # multiple of the group size (e.g. the last batch of an epoch).
        chunks = [torch.argsort(noise[start:start + group], dim=0) + start
                  for start in range(0, batch, group)]
        return torch.cat(chunks, dim=0)

    def __call__(self, inputs, labels, fix_position_ratio=0.5, puzzle_patch_size=32, act=True):
        """
        Fix-position in-place shuffling
        Perform cross-sample random selection to fix some patches in each image of the batch
        After selection, the fixed patches are reserved, the rest patches are batch wise
        in-place shuffled and then regrouped with the fixed patches.
        cross-sample selection is done by argsort random noise in dim 1 and apply to all image within the batch.
        in-place batch-wise shuffle operation is done by argsort random noise in dim 0.
        grouped-in-place batch-wise shuffle operation is done by argsort random noise in the batch dimension

        :param inputs: input image tensor, size of [B, 3, H, W],
        :param labels: integer class indices, one per sample
        :param fix_position_ratio: float ratio of the least remaining part of patches
        :param puzzle_patch_size: int patch size of shuffle
        :param act: set to be False to force not triggering CellMix in validation, set to True to trigger by chance p

        output: x, soft_label, long_label
        x : [B, 3, H, W] re-grouped image after cellmix augmentation
        soft_label : [B, CLS], soft-label of the class distribution
        long_label : [B] hard long-label
        """
        device = inputs.device

        if np.random.randint(0, 101) > 100 * self.p or (not act):
            # not triggered: images are returned untouched with a one-hot label
            soft_label = torch.eye(self.CLS, device=device)[torch.as_tensor(labels, device=device), :]
            return inputs, soft_label, labels

        # Break img into puzzle patches with the size of puzzle_patch_size [B, num_patches, D]
        inputs = patchify(inputs, puzzle_patch_size)
        B, num_patches, D = inputs.shape
        device = inputs.device
        labels = torch.as_tensor(labels, device=device)

        # pseudo-mask [B, num_patches, CLS]: every patch starts as one-hot of its own sample's class
        mask = torch.eye(self.CLS, device=device)[labels, :].unsqueeze(1).repeat(1, num_patches, 1)

        # num of fix_position puzzle patches
        len_fix_position = int(num_patches * fix_position_ratio)

        # one shared noise row, so the same patch positions are fixed in every sample of the batch
        noise = torch.rand(1, num_patches, device=device)
        noise = torch.repeat_interleave(noise, repeats=B, dim=0)

        # argsort gives the position order; argsort of that order gives the inverse permutation
        ids_shuffle = torch.argsort(noise, dim=1)
        ids_restore = torch.argsort(ids_shuffle, dim=1)

        ids_fix = ids_shuffle[:, :len_fix_position]  # [B,num_patches] -> [B,fix_patches]
        ids_puzzle = ids_shuffle[:, len_fix_position:]  # [B,num_patches] -> [B,puzzle_patches]

        # torch.gather to select patch groups x_fixed of [B,fix_patches,D] and x_puzzle of [B,puzzle_patches,D]
        x_fixed = torch.gather(inputs, dim=1, index=ids_fix.unsqueeze(-1).repeat(1, 1, D))
        x_puzzle = torch.gather(inputs, dim=1, index=ids_puzzle.unsqueeze(-1).repeat(1, 1, D))
        mask_fixed = torch.gather(mask, dim=1, index=ids_fix.unsqueeze(-1).repeat(1, 1, self.CLS))
        mask_puzzle = torch.gather(mask, dim=1, index=ids_puzzle.unsqueeze(-1).repeat(1, 1, self.CLS))

        if self.strategy == 'In-place' or self.strategy == 'Random':
            # the In-place strategy shuffles the relation patches within their location, among the batch index
            num_shuffle_patches = x_puzzle.shape[1]
            # [B, num_shuffle_patches] noise in [0, 1]
            noise = torch.rand(B, num_shuffle_patches, device=device)
            in_place_shuffle_indices = self._batch_shuffle_indices(noise)

            # torch.gather to achieve shuffle; images and masks move together so labels stay aligned
            x_puzzle = torch.gather(x_puzzle, dim=0, index=in_place_shuffle_indices.unsqueeze(-1).repeat(1, 1, D))
            mask_puzzle = torch.gather(mask_puzzle, dim=0,
                                       index=in_place_shuffle_indices.unsqueeze(-1).repeat(1, 1, self.CLS))
        else:
            # kept as a non-fatal notice: the batch passes through without any patch exchange
            print('not a valid CellMix strategy')

        # pack up all puzzle patches
        inputs = torch.cat([x_fixed, x_puzzle], dim=1)
        mask = torch.cat([mask_fixed, mask_puzzle], dim=1)

        # unshuffle to restore the fixed positions
        inputs = torch.gather(inputs, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, D))
        mask = torch.gather(mask, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, self.CLS))

        # CellMix random strategy randomly shuffle the image patches (after cellmix in-place shuffle)
        if self.strategy == 'Random':
            noise = torch.rand(B, num_patches, device=device)
            # shuffle dim is 1 (within each sample)
            all_shuffle_indices = torch.argsort(noise, dim=1)
            inputs = torch.gather(inputs, dim=1, index=all_shuffle_indices.unsqueeze(-1).repeat(1, 1, D))
            # the mask is left as is: a shuffle inside one sample does not change its class proportions

        # unpatchify to obtain puzzle images
        inputs = unpatchify(inputs, puzzle_patch_size)  # restore to image size:B,3,224,224/ B,3,384,384

        # transform soft-mask to soft-label: share of patches per class, [B, num_patches, CLS]->(B, CLS)
        soft_label = mask.sum(dim=1) / num_patches
        # long_label, as a data-augmentation requirement
        long_label = soft_label.argmax(dim=1)

        return inputs, soft_label, long_label
strategy='In-place', group_shuffle_size=-1, + device=device) + return Augmentation + + elif augmentation_name == 'CellMix-Random': + Augmentation = CellMix(shuffle_p=p, class_num=class_num, strategy='Random', group_shuffle_size=2, + device=device) + return Augmentation + + elif augmentation_name == 'CellMix-Random4': + Augmentation = CellMix(shuffle_p=p, class_num=class_num, strategy='Random', group_shuffle_size=4, + device=device) + return Augmentation + + elif augmentation_name == 'CellMix-Self': + Augmentation = CellMix(shuffle_p=p, class_num=class_num, strategy='Random', group_shuffle_size=1, + device=device) + return Augmentation + + elif augmentation_name == 'CellMix-All': + Augmentation = CellMix(shuffle_p=p, class_num=class_num, strategy='Random', group_shuffle_size=-1, + device=device) + return Augmentation + + elif augmentation_name == 'Cutout': + Augmentation = Cutout(alpha=2, shuffle_p=p, class_num=class_num, batch_size=batch_size, device=device) + return Augmentation + + elif augmentation_name == 'CutMix': + Augmentation = CutMix(alpha=2, shuffle_p=p, class_num=class_num, batch_size=batch_size, device=device) + return Augmentation + + elif augmentation_name == 'Mixup': + Augmentation = Mixup(alpha=2, shuffle_p=p, class_num=class_num, batch_size=batch_size, device=device) + return Augmentation + + elif augmentation_name == 'SaliencyMix': + Augmentation = SaliencyMix(alpha=1, shuffle_p=p, class_num=class_num, batch_size=batch_size, + device=device) # alpha实际为源代码中beta + return Augmentation + + elif augmentation_name == 'ResizeMix': + Augmentation = ResizeMix(shuffle_p=p, class_num=class_num, batch_size=batch_size, device=device) + return Augmentation + + elif augmentation_name == 'FMix': + # FMIX p=1.0 beacuse the chance of trigger is determined inside its own design + Augmentation = FMix(shuffle_p=1.0, class_num=class_num, batch_size=batch_size, + size=(edge_size, edge_size), device=device) + return Augmentation + + elif augmentation_name == 'PuzzleMix': 
+ return None + # fixme: all related parts have been taken out seperately + # Augmentation = PuzzleMix(alpha=2, shuffle_p=p, class_num=class_num, batch_size=batch_size, device=device) + # return Augmentation + + elif augmentation_name == 'CoMix': + # TODO CoMix + return None + + elif augmentation_name == 'RandomMix': + # TODO RandomMix + return None + + else: + print('no valid counterparts augmentation selected') + return None + + +if __name__ == '__main__': + ''' + Augmentation = get_online_augmentation('CellMix-Split', p=0.5, class_num=2) + output, labels, GT_labels = Augmentation(x, label, fix_position_ratio=0.5, puzzle_patch_size=32, act=True) + + print(labels, GT_labels) + + ''' + + x = torch.load("./temp-tensors/warwick.pt") + # print(x.shape) + label = torch.load("./temp-tensors/warwick_labels.pt") + # print(label) + + # Augmentation = get_online_augmentation('ResizeMix', p=0.5, class_num=2) + # output, labels, GT_labels = Augmentation(x, label, act=True) + Augmentation = get_online_augmentation('CellMix-Group', p=1, class_num=2) + output, labels, GT_labels = Augmentation(x, label, fix_position_ratio=0.5, puzzle_patch_size=32, act=True) + + print(labels, GT_labels) + + composed_img = ToPILImage()(output[0]) + composed_img.show() + composed_img = ToPILImage()(output[1]) + composed_img.show() + composed_img = ToPILImage()(output[2]) + composed_img.show() + composed_img = ToPILImage()(output[3]) + composed_img.show() diff --git a/PuzzleTuning/utils/sam.py b/PuzzleTuning/utils/sam.py new file mode 100644 index 0000000000000000000000000000000000000000..1b44aad2787d00d56905e88b68c8a93c5f37440a --- /dev/null +++ b/PuzzleTuning/utils/sam.py @@ -0,0 +1,108 @@ +""" +版本: 8月26日 17:00 +SAM范化性训练,避免过拟合的优化器优化工具 ICLR 2021 spotlight paper by Google + +介绍:https://mp.weixin.qq.com/s/04VT-ldd0-XEkhEW6Txl_A +第三方实现来自:https://pub.towardsai.net/we-dont-need-to-worry-about-overfitting-anymore-9fb31a154c81 + +论文:Sharpness-aware Minimization for Efficiently Improving Generalization 
+链接:https://arxiv.org/abs/2010.01412 + +计算原理: +在训练过程中,优化器更新模型参数w时,整体上可以分为四个步骤: + +(1)基于参数 w 对 batch data S 计算 gradient G 。 + +(2)求解 G 的 dual norm,依照 dual vector 方向更新参数,得到 w+ε体系下的辅助模型。 + +(3)基于参数 w+ε 下的辅助模型,对 S 计算 gradient G’ 。 + +(4)用 G’ 更新原本的模型的参数 w 。 + + +使用例子: +from sam import SAM +... +model = YourModel() +base_optimizer = torch.optim.SGD # 传入一个优化器模板 +optimizer = SAM(model.parameters(), base_optimizer, lr=0.1, momentum=0.9) # 优化器参数 +... +for input, output in data: + + # first forward-backward pass,计算第一轮loss,这个和普通的一样 + # 第一轮的loss是真实模型跑出来的,我们统计中需要的loss就是它,第二轮loss不是真实的模型的loss(是辅助模型的),所以不需要用在传统统计loss中 + output = model(input) + loss = loss_function(output, labels) # use this loss for any training statistics!!!! + loss.backward() # 模型反向传播,记录原梯度。 + + # step1 的SAM类计算了“SAM梯度” + optimizer.first_step(zero_grad=True) # 第一轮opt用“SAM梯度”对原模型参数体系进行了更新,现在模型变成了辅助模型, + # step1记录保存了回到原模型参数体系的变化方法 + + # second forward-backward pass 第二轮先对辅助模型(step1更新后的模型)正向、反向传播 + output2 = model(input) # 用output2 确保计算图是辅助模型(即step1更新后的模型),不然有一堆bug。 + + # 由于新增了计算图,因此计算时间增加显存占用也增加? + + loss_function(output2, labels).backward() # make sure to do a full forward pass 辅助模型反向传播,记录更新梯度 + optimizer.second_step(zero_grad=True) # 第二轮,先原模型参数替换回去,之后base opt以辅助模型的更新方向对原模型参数体系进行更新 +... 
class SAM(torch.optim.Optimizer):
    """
    Sharpness-Aware Minimization (arXiv:2010.01412) wrapped around a base optimizer.

    One update needs two forward-backward passes:
      1. backward on the real weights w, then first_step() moves the weights to w + e(w)
      2. backward on the perturbed weights, then second_step() restores w and lets the
         base optimizer update w with the gradient measured at w + e(w)

    :param params: iterable of parameters or parameter-group dicts
    :param base_optimizer: optimizer class (not an instance), e.g. torch.optim.SGD
    :param rho: size of the perturbation neighbourhood, must be non-negative
    :param kwargs: forwarded to the base optimizer (lr, momentum, ...)
    """

    def __init__(self, params, base_optimizer, rho=0.05, **kwargs):
        assert rho >= 0.0, f"Invalid rho, should be non-negative: {rho}"

        defaults = dict(rho=rho, **kwargs)
        super(SAM, self).__init__(params, defaults)

        # both optimizers share the very same param_groups list
        self.base_optimizer = base_optimizer(self.param_groups, **kwargs)
        self.param_groups = self.base_optimizer.param_groups

    @torch.no_grad()
    def first_step(self, zero_grad=False):
        """Perturb the weights in place to w + e(w) and remember e(w) so it can be undone."""
        grad_norm = self._grad_norm()

        for group in self.param_groups:
            scale = group["rho"] / (grad_norm + 1e-12)  # epsilon guards a zero gradient norm

            for p in group["params"]:
                if p.grad is None:
                    continue
                e_w = p.grad * scale.to(p)  # perturbation along the gradient direction
                p.add_(e_w)  # climb to the local maximum "w + e(w)"
                self.state[p]["e_w"] = e_w

        if zero_grad:
            self.zero_grad()

    @torch.no_grad()
    def second_step(self, zero_grad=False):
        """Restore the original weights, then update them with the base optimizer."""
        for group in self.param_groups:
            for p in group["params"]:
                # Undo the perturbation wherever one was applied, whether or not the second
                # backward pass produced a gradient for this parameter; otherwise the
                # parameter would stay at "w + e(w)". Popping avoids reusing a stale e(w).
                e_w = self.state[p].pop("e_w", None)
                if e_w is not None:
                    p.sub_(e_w)  # get back to "w" from "w + e(w)"

        self.base_optimizer.step()  # do the actual "sharpness-aware" update

        if zero_grad:
            self.zero_grad()

    @torch.no_grad()
    def step(self, closure=None):
        """
        Run a whole SAM update in one call.

        Gradients of the first pass must already be populated; `closure` has to perform the
        second full forward-backward pass.

        :raises ValueError: if no closure is given
        """
        if closure is None:
            raise ValueError("Sharpness Aware Minimization requires closure, but it was not provided")
        closure = torch.enable_grad()(closure)  # this method runs under no_grad; the closure needs grad

        self.first_step(zero_grad=True)
        closure()
        self.second_step()

    def load_state_dict(self, state_dict):
        """Load the state and re-share the rebuilt param_groups with the base optimizer."""
        super().load_state_dict(state_dict)
        self.base_optimizer.param_groups = self.param_groups

    def _grad_norm(self):
        """Return the global L2 norm over all existing gradients (zero when there are none)."""
        shared_device = self.param_groups[0]["params"][0].device
        # put everything on the same device, in case of model parallelism
        norms = [p.grad.norm(p=2).to(shared_device)
                 for group in self.param_groups for p in group["params"]
                 if p.grad is not None]
        if not norms:
            # torch.stack cannot take an empty list
            return torch.zeros((), device=shared_device)
        return torch.norm(torch.stack(norms), p=2)
class patch_scheduler:
    """
    Drive the puzzle patch size by epoch and (optionally) loss.

    The list of legal patch sizes is built automatically from edge_size and basic_patch.
    Calling the scheduler with (epoch, loss) returns the patch size to use for that epoch.
    """

    def __init__(self, total_epoches=200, warmup_epochs=20, edge_size=384, basic_patch=16, strategy=None,
                 threshold=3.0, reducing_factor=0.933, fix_patch_size=None, patch_size_jump=None,
                 warmup_patch_size=32):
        """
        :param total_epoches: number of training epochs
        :param warmup_epochs: number of warm-up epochs, during which warmup_patch_size is used
        :param edge_size: image edge size the patch sizes have to divide
        :param basic_patch: every patch size has to be a multiple of this
        :param strategy: None / 'fixed' / 'linear' / 'reverse' / 'loop' / 'random' /
            'loss_back' / 'loss_hold'; anything else behaves like 'fixed'
        :param threshold: loss threshold of the loss-driven strategies; for 'loop' its integer
            part is the number of epochs each patch size is held
        :param reducing_factor: the threshold is multiplied by this each time the loss beats it
        :param fix_patch_size: patch size of the fixed strategy (default: first entry of the list)
        :param patch_size_jump: 'odd' / 'even' to keep every second size, None to keep all
        :param warmup_patch_size: patch size used during warm-up by the epoch-driven strategies

        :raises ValueError: if no legal patch size exists and none is fixed explicitly
        """
        super().__init__()

        self.strategy = strategy

        self.total_epoches = total_epoches
        self.warmup_epochs = warmup_epochs
        self.warmup_patch_size = warmup_patch_size

        # automatically build legal patch list, from small to big size
        self.patch_list = defactor(factor(edge_size), basic_patch)

        self.threshold = threshold
        self.reducing_factor = reducing_factor
        self.fix_patch_size = fix_patch_size

        # from small to big patch; the whole-image "patch" is dropped when alternatives exist
        if len(self.patch_list) > 1:
            self.patch_list = self.patch_list[:-1]

        is_fixed = self.strategy is None or self.strategy == 'fixed'
        if not self.patch_list and not (is_fixed and self.fix_patch_size):
            # fail early with a clear message instead of an IndexError at some later epoch
            raise ValueError('no legal patch size: edge_size {} has no divisor that is a multiple of '
                             'basic_patch {}'.format(edge_size, basic_patch))

        # jump_patch_list by selecting the 'odd' or 'even', but both with the smallest patch size
        if patch_size_jump == 'odd':  # 384:[192, 96, 48, 16]
            self.patch_list = self.patch_list[0::2]
        elif patch_size_jump == 'even':  # 384:[128, 64, 32, 16]
            # every second size, plus the smallest one added back in front
            self.patch_list = [self.patch_list[0]] + self.patch_list[1::2]

        if self.strategy in ['reverse', 'loss_back', 'loss_hold']:  # start from big(easy) to small(complex)
            self.patch_list.sort(reverse=True)

        if is_fixed:
            puzzle_patch_size = self.fix_patch_size or self.patch_list[0]
            print('patch_list:', puzzle_patch_size)
        else:
            print('patch_list:', self.patch_list)

    def _scheduled_index(self, epoch, shift=0):
        """Map post-warm-up training progress to a patch_list index, moved by `shift` and clamped."""
        last = len(self.patch_list) - 1
        span = self.total_epoches - self.warmup_epochs
        if span <= 0:
            # no epochs after warm-up to spread over: treat the schedule as finished
            base = last
        else:
            base = int((epoch - self.warmup_epochs) / span * len(self.patch_list))
        return min(max(base + shift, 0), last)

    def __call__(self, epoch, loss=0.0):
        """
        :param epoch: current epoch
        :param loss: latest loss; 0.0 means "no loss available" for the loss-driven strategies
        :return: the puzzle patch size for this epoch
        """
        strategy = self.strategy
        in_warmup = epoch < self.warmup_epochs

        # Designed for flexible ablations
        if strategy == 'linear' or strategy == 'reverse':  # reverse goes from big size to small
            if in_warmup:
                return self.warmup_patch_size
            return self.patch_list[self._scheduled_index(epoch)]

        if strategy == 'loop':
            if in_warmup:
                return self.warmup_patch_size
            # cycle through the sizes, holding each one for group_size epochs (at least one)
            group_size = max(int(self.threshold), 1)
            group_idx = (epoch - self.warmup_epochs) % (len(self.patch_list) * group_size)
            return self.patch_list[int(group_idx / group_size)]

        if strategy == 'random':  # random size strategy
            return random.choice(self.patch_list)

        if strategy == 'loss_back' or strategy == 'loss_hold':
            if in_warmup:
                return self.warmup_patch_size
            if loss == 0.0:
                shift = 0
            elif loss < self.threshold:
                # loss beat the threshold: step one size further and tighten the threshold
                shift = 1
                self.threshold *= self.reducing_factor
            else:
                # loss too high: 'loss_back' steps one size back, 'loss_hold' stays on schedule
                shift = -1 if strategy == 'loss_back' else 0
            return self.patch_list[self._scheduled_index(epoch, shift)]

        # if self.strategy is None or 'fixed' or 'ratio-decay'
        return self.fix_patch_size or self.patch_list[0]  # basic_patch
= max(self.basic_ratio * self.ratio_floor_factor, self.lower_limit) + + fix_position_ratio = min(max(((self.total_epoches - self.warmup_epochs) + - (epoch - self.warmup_epochs)) / + (self.total_epoches - self.warmup_epochs) + * max_ratio, min_ratio), max_ratio) + + elif self.strategy == 'loss_back': + + if epoch < self.warmup_epochs: # for warmup + fix_position_ratio = self.basic_ratio # in warm-up we use the fix ratio + + else: + max_ratio = min(3 * self.basic_ratio, self.upper_limit) + min_ratio = max(self.basic_ratio * self.ratio_floor_factor, self.lower_limit) + if loss == 0.0: + fix_position_ratio = min(max(((self.total_epoches - self.warmup_epochs) + - (epoch - self.warmup_epochs)) / + (self.total_epoches - self.warmup_epochs) + * max_ratio, min_ratio), max_ratio) + elif loss < self.threshold: + fix_position_ratio = min(max(((self.total_epoches - self.warmup_epochs) + - (epoch - self.warmup_epochs)) / + (self.total_epoches - self.warmup_epochs) + * max_ratio * 0.9, min_ratio), max_ratio) + self.threshold *= self.loss_reducing_factor + else: + fix_position_ratio = min(max(((self.total_epoches - self.warmup_epochs) + - (epoch - self.warmup_epochs)) / + (self.total_epoches - self.warmup_epochs) + * max_ratio * 1.1, min_ratio), max_ratio) + + elif self.strategy == 'loss_hold': + + if epoch < self.warmup_epochs: # for warmup + fix_position_ratio = self.basic_ratio # in warm-up we use the fix ratio + + else: + max_ratio = min(3 * self.basic_ratio, self.upper_limit) + min_ratio = max(self.basic_ratio * self.ratio_floor_factor, self.lower_limit) + + if loss == 0.0: + fix_position_ratio = min(max(((self.total_epoches - self.warmup_epochs) + - (epoch - self.warmup_epochs)) / + (self.total_epoches - self.warmup_epochs) + * max_ratio, min_ratio), max_ratio) + elif loss < self.threshold: + fix_position_ratio = min(max(((self.total_epoches - self.warmup_epochs) + - (epoch - self.warmup_epochs)) / + (self.total_epoches - self.warmup_epochs) + * max_ratio * 0.9, min_ratio), 
max_ratio) + self.threshold *= self.loss_reducing_factor + else: + fix_position_ratio = min(max(((self.total_epoches - self.warmup_epochs) + - (epoch - self.warmup_epochs)) / + (self.total_epoches - self.warmup_epochs) + * max_ratio, min_ratio), max_ratio) + + else: # basic_ratio + fix_position_ratio = self.fix_position_ratio or self.basic_ratio + + return fix_position_ratio + + +''' +scheduler = puzzle_fix_position_ratio_scheduler(strategy='reverse') +epoch = 102 +fix_position_ratio = scheduler(epoch) +print(fix_position_ratio) +''' diff --git a/PuzzleTuning/utils/temp-tensors/color.pt b/PuzzleTuning/utils/temp-tensors/color.pt new file mode 100644 index 0000000000000000000000000000000000000000..058c6edd177162c02e2cf7e802a0c9191b5897b4 --- /dev/null +++ b/PuzzleTuning/utils/temp-tensors/color.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63b58afc3d30f2cead67e4e9d77d0509a9cc547cc0a26390e07204e8f8f9ff0c +size 2409195 diff --git a/PuzzleTuning/utils/temp-tensors/color_labels.pt b/PuzzleTuning/utils/temp-tensors/color_labels.pt new file mode 100644 index 0000000000000000000000000000000000000000..135a55b90ab671cb650ff50862582cb120d6e69f --- /dev/null +++ b/PuzzleTuning/utils/temp-tensors/color_labels.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25f4d6316bbcab066eaf26be4e418547a468d3f5d45c274535306bd37ca814b6 +size 747 diff --git a/PuzzleTuning/utils/temp-tensors/warwick.pt b/PuzzleTuning/utils/temp-tensors/warwick.pt new file mode 100644 index 0000000000000000000000000000000000000000..e37fa21a753b5c76591757775ad2e267651af424 --- /dev/null +++ b/PuzzleTuning/utils/temp-tensors/warwick.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ec6955b6afe4692ee4ead145f50e052e5c4481409beeda42bc4377d6fe8f579 +size 2409195 diff --git a/PuzzleTuning/utils/temp-tensors/warwick_labels.pt b/PuzzleTuning/utils/temp-tensors/warwick_labels.pt new file mode 100644 index 
"""
Tools Script ver: Feb 22rd 20:00
"""
import os
import shutil
import torch
import numpy as np
from collections import OrderedDict


# Tools
def del_file(filepath):
    """
    clear all items within a folder (the folder itself is kept)
    :param filepath: folder path
    :return: None
    """
    for item in os.listdir(filepath):
        item_path = os.path.join(filepath, item)
        if os.path.isfile(item_path):
            os.remove(item_path)
        elif os.path.isdir(item_path):
            shutil.rmtree(item_path)


def to_2tuple(input):
    """
    Normalise an int / tuple / list into a 2-tuple.

    - int: duplicated, 3 -> (3, 3)
    - tuple or list of length 1: the single item is duplicated
    - tuple or list of length >= 2: the first two items are used
      (a tuple that already has length 2 is returned as-is)
    - empty tuple or list: a message is printed and None is returned

    :raises TypeError: for any other input type
    """
    if isinstance(input, (tuple, list)):
        if len(input) == 0:
            print('cannot handle none tuple' if isinstance(input, tuple) else 'cannot handle none list')
            return None
        if isinstance(input, tuple) and len(input) == 2:
            return input
        if len(input) == 1:
            return (input[0], input[0])
        return (input[0], input[1])

    # bool is a subclass of int but was never accepted here
    if isinstance(input, int) and not isinstance(input, bool):
        return (input, input)

    # the previous code did `raise (msg, type)`, which Python turns into an unrelated
    # "exceptions must derive from BaseException" TypeError; raise a meaningful one instead
    raise TypeError('cannot handle ' + str(type(input)))


def find_all_files(root, suffix=None):
    """
    Return a list of file paths (searched recursively under root) ended with specific suffix

    :param root: folder to walk
    :param suffix: None (every file), a str, or a tuple/list of str (any of them matches)
    :return: list of paths; -1 if suffix has an unsupported type
    """
    if suffix is None or isinstance(suffix, str):
        wanted = suffix
    elif isinstance(suffix, (tuple, list)):
        wanted = tuple(suffix)  # str.endswith accepts a tuple of alternatives
    else:
        print('type of suffix is not legal :', type(suffix))
        return -1

    res = []
    for folder, _, files in os.walk(root):
        for f in files:
            if wanted is None or f.endswith(wanted):
                res.append(os.path.join(folder, f))
    return res


# Transfer state_dict by removing misalignment
def FixStateDict(state_dict, remove_key_head=None):
    """
    Obtain a fixed state_dict by removing misalignment

    :param state_dict: model state_dict of OrderedDict()
    :param remove_key_head: the str or list/tuple of strings; every key that starts with
        (one of) them is dropped. None returns state_dict itself, untouched.
    :return: a new OrderedDict with the remaining items; -1 if remove_key_head has an unsupported type
    """
    if remove_key_head is None:
        return state_dict

    if isinstance(remove_key_head, str):
        heads = remove_key_head
    elif isinstance(remove_key_head, (list, tuple)):
        heads = tuple(remove_key_head)  # str.startswith accepts a tuple of alternatives
    else:
        print('erro in defining remove_key_head !')
        return -1

    # keep only the layers whose key does not start with a removed head
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        if not k.startswith(heads):
            new_state_dict[k] = v
    return new_state_dict


def setup_seed(seed):  # setting up the random seed
    """Seed torch (CPU and all CUDA devices), numpy and random, and make cuDNN deterministic."""
    import random
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
"""
Transfer PuzzleTuning Pre-Training checkpoints Script ver: Oct 23rd 17:00

write a model based on the weight of a checkpoint file
EG: create a vit-base based on PuzzleTuning SAE

"""
import argparse

import sys
sys.path.append('..')
import os
import torch
import torch.nn as nn

from Backbone import getmodel, GetPromptModel
from SSL_structures import SAE


def _unpack_checkpoint(state):
    """
    Split a loaded checkpoint into (model_state, epoch_tag).

    A training checkpoint is a dict such as {'model': ..., 'optimizer': ..., 'epoch': ...};
    anything without a 'model' entry is treated as a bare state_dict.
    epoch_tag is '_E_<epoch>' when the checkpoint records an epoch, otherwise ''.
    """
    if isinstance(state, dict) and 'model' in state:
        if 'epoch' in state:
            print("checkpoint epoch", state['epoch'])
            return state['model'], '_E_' + str(state['epoch'])
        print("no 'epoch' in state")
        return state['model'], ''

    print("not a checkpoint state (no 'model' in state)")
    return state, ''


def _load_weights(model, model_state, model_idx):
    """
    Load model_state into model, first directly and then through an nn.DataParallel wrapper.

    :return: (model, gpu_use) with gpu_use 0 = loaded directly (model unchanged),
             -1 = loaded through nn.DataParallel (model is the wrapper), -2 = both attempts failed
    """
    try:
        model.load_state_dict(model_state)
    except Exception:
        try:
            model = nn.DataParallel(model)
            # NOTE(review): strict=False only fails on shape mismatches; keys that do not match
            # are skipped silently, so this can "succeed" while loading few or no weights - verify.
            model.load_state_dict(model_state, False)
        except Exception:
            print("model loading erro!!")
            return model, -2
        print("DataParallel model loaded")
        print("model :", model_idx)
        return model, -1

    print("model loaded")
    print("model :", model_idx)
    return model, 0


# Transfer pretrained MSHT checkpoints to normal model state_dict
def transfer_model_encoder(check_point_path, save_model_path, model_idx='ViT', prompt_mode=None,
                           Prompt_Token_num=20, edge_size=384, given_name=None):
    """
    Build a backbone, load the checkpoint weights into it and save the resulting state_dict.

    :param check_point_path: path of the checkpoint (training checkpoint or bare state_dict)
    :param save_model_path: output folder, created when missing
    :param model_idx: model name handed to the model builders
    :param prompt_mode: "Deep"/"Shallow" builds the prompt model; whenever it is not None only
        the prompt state (model.obtain_prompt()) is saved instead of the whole state_dict
    :param Prompt_Token_num: number of prompt tokens of the prompt model
    :param edge_size: model input size
    :param given_name: optional output file name; overrides the auto-generated
        '<checkpoint>_of_<model_idx>[_E_<epoch>]_promptstate|_transfer.pth'
    """
    if not os.path.exists(save_model_path):
        os.makedirs(save_model_path)

    if prompt_mode == "Deep" or prompt_mode == "Shallow":
        model = GetPromptModel.build_promptmodel(edge_size=edge_size, model_idx=model_idx, patch_size=16,
                                                 Prompt_Token_num=Prompt_Token_num, VPT_type=prompt_mode,
                                                 base_state_dict=None)
    # elif prompt_mode == "Other" or prompt_mode == None:
    else:
        model = getmodel.get_model(model_idx=model_idx, pretrained_backbone=False, edge_size=edge_size)

    # state = {'model': model.state_dict(), 'optimizer': optimizer.state_dict(), 'epoch': epoch}
    # TempBest_state = {'model': best_model_wts, 'epoch': best_epoch_idx}
    # map to CPU so that checkpoints saved from a CUDA model can be transferred without a GPU
    state = torch.load(check_point_path, map_location='cpu')

    transfer_name = os.path.splitext(os.path.split(check_point_path)[1])[0] + '_of_'
    model_state, epoch_tag = _unpack_checkpoint(state)

    if given_name is not None:
        output_path = os.path.join(save_model_path, given_name)
    else:
        suffix = '_promptstate' if prompt_mode is not None else '_transfer'
        output_path = os.path.join(save_model_path, transfer_name + model_idx + epoch_tag + suffix + '.pth')

    model, gpu_use = _load_weights(model, model_state, model_idx)

    if gpu_use == -1:
        # print(model)
        if prompt_mode is not None:
            prompt_state_dict = model.module.obtain_prompt()
            # fixme maybe bug at DP module.obtain_prompt, just model.obtain_prompt is enough
            print('prompt obtained')
            torch.save(prompt_state_dict, output_path)
        else:
            torch.save(model.module.state_dict(), output_path)
        print('model trained by multi-GPUs has its single GPU copy saved at ', output_path)

    elif gpu_use == 0:
        if prompt_mode is not None:
            prompt_state_dict = model.obtain_prompt()
            print('prompt obtained')
            torch.save(prompt_state_dict, output_path)
        else:
            torch.save(model.state_dict(), output_path)
        print('model trained by a single GPU has been saved at ', output_path)
    else:
        print('erro')


def transfer_model_decoder(check_point_path, save_model_path,
                           model_idx='sae_vit_base_patch16_decoder', dec_idx='swin_unet',
                           prompt_mode=None, Prompt_Token_num=20, edge_size=384):
    """
    Build the SAE model named model_idx, load the checkpoint and save only its decoder weights to
    '<save_model_path>/<checkpoint>_of_Decoder_<dec_idx>[_E_<epoch>].pth'.

    :param check_point_path: path of the checkpoint (training checkpoint or bare state_dict)
    :param save_model_path: output folder, created when missing
    :param model_idx: name of the model constructor looked up in SSL_structures.SAE
    :param dec_idx: decoder name, passed to the constructor and used in the output file name
    :param prompt_mode: passed to the constructor
    :param Prompt_Token_num: passed to the constructor
    :param edge_size: passed to the constructor as img_size
    """
    if not os.path.exists(save_model_path):
        os.makedirs(save_model_path)

    # map to CPU so that checkpoints saved from a CUDA model can be transferred without a GPU
    state = torch.load(check_point_path, map_location='cpu')

    transfer_name = os.path.splitext(os.path.split(check_point_path)[1])[0] + '_of_'

    model = SAE.__dict__[model_idx](img_size=edge_size, prompt_mode=prompt_mode, Prompt_Token_num=Prompt_Token_num,
                                    basic_state_dict=None, dec_idx=dec_idx)

    model_state, epoch_tag = _unpack_checkpoint(state)
    output_path = os.path.join(save_model_path, transfer_name + 'Decoder_' + dec_idx + epoch_tag + '.pth')

    model, gpu_use = _load_weights(model, model_state, model_idx)

    if gpu_use == -1:
        torch.save(model.module.decoder.state_dict(), output_path)
        print('model trained by multi-GPUs has its single GPU copy saved at ', output_path)

    elif gpu_use == 0:
        torch.save(model.decoder.state_dict(), output_path)
        print('model trained by a single GPU has been saved at ', output_path)
    else:
        print('erro')


def get_args_parser():
    """Build the command line parser of the transfer script."""
    parser = argparse.ArgumentParser('Take pre-trained model from PuzzleTuning', add_help=False)

    # Model Name or index
    parser.add_argument('--given_name', default=None, type=str, help='name of the weight-state-dict')
    parser.add_argument('--model_idx', default='ViT', type=str, help='taking the weight to the specified model')
    parser.add_argument('--edge_size', default=224, type=int, help='images input size for model')

    # PromptTuning
    parser.add_argument('--PromptTuning', default=None, type=str,
                        help='Deep/Shallow to use Prompt Tuning model instead of Finetuning model, by default None')
    # Prompt_Token_num
    parser.add_argument('--Prompt_Token_num', default=20, type=int, help='Prompt_Token_num')

    # PATH settings
    parser.add_argument('--checkpoint_path', default=None, type=str, help='check_point_path')
    parser.add_argument('--save_model_path', default=None, type=str, help='out put weight path for pre-trained model')

    return parser


def main(args):
    """
    Transfer the encoder of the checkpoint given on the command line.

    PuzzleTuning Template
    # Prompt
    # transfer_model_encoder(checkpoint_path, save_model_path, model_idx='ViT', edge_size=224, prompt_mode='Deep', Prompt_Token_num=20,given_name=given_name)

    # not prompt model
    # transfer_model_encoder(checkpoint_path, save_model_path, model_idx='ViT', edge_size=224, given_name=given_name)

    # decoder
    # transfer_model_decoder(checkpoint_path, save_model_path, model_idx='sae_vit_base_patch16_decoder', dec_idx='swin_unet', edge_size=224, prompt_mode='Deep')


    PuzzleTuning Experiments transfer records
    (every record used save_model_path = '/root/autodl-tmp/output_models'):

    # 1 periodic puzzle with automatically decreasing ratio and automatically looping patch size,
    #   transferred from timm, PromptTuning: VPT-Deep, seg_decoder: None (core method)
    # ViT_b16_224_timm_PuzzleTuning_SAE_CPIAm_Prompt_Deep_tokennum_20_E_199_promptstate.pth
    # NOTE(review): the given_name below says E_50 while the checkpoint and the line above say 199 - confirm.
    checkpoint_path = '/root/autodl-tmp/runs/PuzzleTuning_SAE_vit_base_patch16_Prompt_Deep_tokennum_20_tr_timm_CPIAm/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20_checkpoint-199.pth'
    given_name = r'ViT_b16_224_timm_PuzzleTuning_SAE_CPIAm_Prompt_Deep_tokennum_20_E_50_promptstate.pth'
    transfer_model_encoder(checkpoint_path, save_model_path, model_idx='ViT', edge_size=224, prompt_mode='Deep',
                           Prompt_Token_num=20,given_name=given_name)

    PuzzleTuning Ablation studies: SAE + different curricula + different VPT/ViT
    # 2 periodic puzzle with automatically decreasing ratio and automatically looping patch size,
    #   transferred from timm, PromptTuning: None, seg_decoder: None
    # ViT_b16_224_timm_PuzzleTuning_SAE_CPIAm_E_199.pth
    checkpoint_path = '/root/autodl-tmp/runs/PuzzleTuning_SAE_vit_base_patch16_tr_timm_CPIAm/PuzzleTuning_sae_vit_base_patch16_checkpoint-199.pth'
    given_name = r'ViT_b16_224_timm_PuzzleTuning_SAE_CPIAm_E_199.pth'
    transfer_model_encoder(checkpoint_path, save_model_path, model_idx='ViT', edge_size=224, given_name=given_name)

    # 3 fixed puzzle ratio, fixed patch size, transferred from timm, PromptTuning: VPT-Deep,
    #   seg_decoder: None (server pt1)
    # ViT_b16_224_timm_PuzzleTuning_fixp16fixr25_SAE_CPIAm_Prompt_Deep_tokennum_20_E_199_promptstate.pth
    checkpoint_path = '/root/autodl-tmp/runs/PuzzleTuning_SAE_fixp16fixr25_vit_base_Prompt_Deep_tokennum_20_tr_timm_CPIAm/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20_checkpoint-199.pth'
    given_name = r'ViT_b16_224_timm_PuzzleTuning_SAE_fixp16fixr25_CPIAm_Prompt_Deep_tokennum_20_E_199_promptstate.pth'
    transfer_model_encoder(checkpoint_path, save_model_path, model_idx='ViT', edge_size=224, prompt_mode='Deep',
                           Prompt_Token_num=20, given_name=given_name)

    # 4 fixed puzzle ratio, fixed patch size, transferred from timm, PromptTuning: None,
    #   seg_decoder: None (server pt2)
    # ViT_b16_224_timm_PuzzleTuning_fixp16fixr25_SAE_CPIAm_E_199.pth
    checkpoint_path = '/root/autodl-tmp/runs/PuzzleTuning_SAE_fixp16fixr25_vit_base_patch16_tr_timm_CPIAm/PuzzleTuning_sae_vit_base_patch16_checkpoint-199.pth'
    given_name = r'ViT_b16_224_timm_PuzzleTuning_SAE_fixp16fixr25_CPIAm_E_199.pth'
    transfer_model_encoder(checkpoint_path, save_model_path, model_idx='ViT', edge_size=224, given_name=given_name)

    # 5 varying puzzle ratio, fixed patch size, transferred from timm, PromptTuning: VPT-Deep,
    #   seg_decoder: None, strategy: ratio-decay (server pt3)
    # ViT_b16_224_timm_PuzzleTuning_fixp16ratiodecay_SAE_CPIAm_Prompt_Deep_tokennum_20_E_199_promptstate.pth
    checkpoint_path = '/root/autodl-tmp/runs/PuzzleTuning_SAE_fixp16ratiodecay_vit_base_Prompt_Deep_tokennum_20_tr_timm_CPIAm/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20_checkpoint-199.pth'
    given_name = r'ViT_b16_224_timm_PuzzleTuning_SAE_fixp16ratiodecay_CPIAm_Prompt_Deep_tokennum_20_E_199_promptstate.pth'
    transfer_model_encoder(checkpoint_path, save_model_path, model_idx='ViT', edge_size=224, prompt_mode='Deep',
                           Prompt_Token_num=20, given_name=given_name)

    # 6 varying puzzle ratio, fixed patch size, transferred from timm, PromptTuning: None,
    #   seg_decoder: None (server pt4)
    # ViT_b16_224_timm_PuzzleTuning_fixp16ratiodecay_SAE_CPIAm_E_199.pth
    checkpoint_path = '/root/autodl-tmp/runs/PuzzleTuning_SAE_fixp16ratiodecay_vit_base_patch16_tr_timm_CPIAm/PuzzleTuning_sae_vit_base_patch16_checkpoint-199.pth'
    given_name = r'ViT_b16_224_timm_PuzzleTuning_SAE_fixp16ratiodecay_CPIAm_E_199.pth'
    transfer_model_encoder(checkpoint_path, save_model_path, model_idx='ViT', edge_size=224, given_name=given_name)

    PuzzleTuning Ablation studies: no puzzle upstream, so this is VPT+MAE
    # 7 MAE+VPT, transferred from timm, PromptTuning: VPT-Deep, seg_decoder: None (A40*4 server pt5)
    # ViT_b16_224_timm_PuzzleTuning_MAE_CPIAm_Prompt_Deep_tokennum_20_E_199_promptstate.pth
    checkpoint_path = '/root/autodl-tmp/runs/PuzzleTuning_MAE_vit_base_Prompt_Deep_tokennum_20_tr_timm_CPIAm/PuzzleTuning_mae_vit_base_patch16_Prompt_Deep_tokennum_20_checkpoint-199.pth'
    given_name = r'ViT_b16_224_timm_PuzzleTuning_MAE_CPIAm_Prompt_Deep_tokennum_20_E_199_promptstate.pth'
    transfer_model_encoder(checkpoint_path, save_model_path, model_idx='ViT', edge_size=224, prompt_mode='Deep',
                           Prompt_Token_num=20, given_name=given_name)

    # 8 periodic puzzle with automatically decreasing ratio and automatically looping patch size,
    #   transferred from MAEImageNet, PromptTuning: VPT-Deep, seg_decoder: None (A100-PCIE*2 server pt6)
    # ViT_b16_224_MAEImageNet_PuzzleTuning_SAE_CPIAm_Prompt_Deep_tokennum_20_E_199_promptstate.pth
    checkpoint_path = '/root/autodl-tmp/runs/PuzzleTuning_SAE_vit_base_patch16_Prompt_Deep_tokennum_20_tr_MAEImageNet_CPIAm/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20_checkpoint-199.pth'
    given_name = r'ViT_b16_224_MAEImageNet_PuzzleTuning_SAE_CPIAm_Prompt_Deep_tokennum_20_E_199_promptstate.pth'
    transfer_model_encoder(checkpoint_path, save_model_path, model_idx='ViT', edge_size=224, prompt_mode='Deep',
                           Prompt_Token_num=20, given_name=given_name)

    # 9 periodic puzzle with automatically decreasing ratio and automatically looping patch size,
    #   transferred from Random, PromptTuning: VPT-Deep, seg_decoder: None (A40*4 server pt7)
    # ViT_b16_224_Random_PuzzleTuning_SAE_CPIAm_Prompt_Deep_tokennum_20_E_199_promptstate.pth
    checkpoint_path = '/root/autodl-tmp/runs/PuzzleTuning_SAE_vit_base_patch16_Prompt_Deep_tokennum_20_tr_Random_CPIAm/PuzzleTuning_sae_vit_base_patch16_Prompt_Deep_tokennum_20_checkpoint-199.pth'
    given_name = r'ViT_b16_224_Random_PuzzleTuning_SAE_CPIAm_Prompt_Deep_tokennum_20_E_199_promptstate.pth'
    transfer_model_encoder(checkpoint_path, save_model_path, model_idx='ViT', edge_size=224, prompt_mode='Deep',
                           Prompt_Token_num=20, given_name=given_name)

    # 10 periodic puzzle with automatically decreasing ratio and automatically looping patch size,
    #    transferred from Random, PromptTuning: None, seg_decoder: None (4090*6 server pt8)
    # NOTE(review): the title says "Random" but the paths and names below say MAEImageNet - confirm.
    # ViT_b16_224_MAEImageNet_PuzzleTuning_SAE_CPIAm_E_199.pth
    checkpoint_path = '/root/autodl-tmp/runs/PuzzleTuning_SAE_vit_base_patch16_tr_MAEImageNet_CPIAm/PuzzleTuning_sae_vit_base_patch16_checkpoint-199.pth'
    given_name = r'ViT_b16_224_MAEImageNet_PuzzleTuning_SAE_CPIAm_E_199.pth'
    transfer_model_encoder(checkpoint_path, save_model_path, model_idx='ViT', edge_size=224, given_name=given_name)
    """
    # checkpoints are loaded with map_location='cpu', so a CUDA device is no longer required here

    transfer_model_encoder(args.checkpoint_path, args.save_model_path,
                           model_idx=args.model_idx, edge_size=args.edge_size,
                           prompt_mode=args.PromptTuning, Prompt_Token_num=args.Prompt_Token_num,
                           given_name=args.given_name)


if __name__ == '__main__':
    args = get_args_parser()
    args = args.parse_args()

    main(args)
"""
Attention Visulization Script ver: Oct 23rd 18:00
use rgb format input
"""

import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import os
from PIL import Image
from torchvision.transforms import ToPILImage


def softmax(x, dim=None):
    """
    Compute the softmax of a tensor.

    :param x: input tensor
    :param dim: axis to normalise over. None keeps the historical behaviour of calling
        nn.Softmax() without a dim (axis 0 for 0-D/1-D/3-D inputs, axis 1 otherwise),
        but without the deprecation warning and without building a module per call.
    :return: tensor of the same shape as x
    """
    if dim is None:
        dim = 0 if x.dim() in (0, 1, 3) else 1
    return torch.softmax(x, dim=dim)


def imshow(inp, title=None):  # Imshow for Tensor
    """
    Imshow for Tensor.

    :param inp: CPU image tensor; it is transposed with (1, 2, 0) before plotting
    :param title: optional title of the current axes
    """
    inp = inp.numpy().transpose((1, 2, 0))
    # if required: Alter the transform
    # because transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    #     mean = np.array([0.485, 0.456, 0.406])
    #     std = np.array([0.229, 0.224, 0.225])
    #     inp = std * inp + mean
    #     inp = np.clip(inp, 0, 1)
    plt.imshow(inp)
    if title is not None:
        plt.title(title)
    plt.pause(0.001)  # pause a bit so that plots are updated


def Draw_tri_fig(Ori_img, Puz_img, Rec_img, picpath):
    """
    Plot the original / puzzle / restored images side by side, save the figure to picpath
    (dpi=400), show it and close all figures.
    """
    plt.figure()

    panels = (('Original', Ori_img), ('Puzzle', Puz_img), ('Restored', Rec_img))
    for position, (panel_title, img) in enumerate(panels, start=1):
        ax = plt.subplot(1, 3, position)
        ax.axis('off')
        ax.set_title(panel_title)
        plt.imshow(img)

    plt.savefig(picpath, dpi=400)
    plt.show()

    plt.cla()
    plt.close("all")
# Grad CAM part:Visualize of CNN+Transformer attention area
def _tokens_to_feature_map(tensor, height, width, drop_cls_token):
    """
    Fold a token sequence [B, N, C] into a CNN-like feature map [B, C, height, width].

    :param drop_cls_token: when True the first token is removed before folding
    """
    if drop_cls_token:
        tensor = tensor[:, 1:, :]
    result = tensor.reshape(tensor.size(0), height, width, tensor.size(2))

    # Bring the channels to the first dimension, like in CNNs.
    return result.permute(0, 3, 1, 2)


def cls_token_s12_transform(tensor, height=12, width=12):  # based on pytorch_grad_cam
    """Tokens with a leading CLS token -> [B, C, 12, 12] feature map."""
    return _tokens_to_feature_map(tensor, height, width, drop_cls_token=True)


def cls_token_s14_transform(tensor, height=14, width=14):  # based on pytorch_grad_cam
    """Tokens with a leading CLS token -> [B, C, 14, 14] feature map."""
    return _tokens_to_feature_map(tensor, height, width, drop_cls_token=True)


def cls_token_s16_transform(tensor, height=16, width=16):  # based on pytorch_grad_cam
    """Tokens with a leading CLS token -> [B, C, 16, 16] feature map."""
    return _tokens_to_feature_map(tensor, height, width, drop_cls_token=True)


def cls_token_s24_transform(tensor, height=24, width=24):  # based on pytorch_grad_cam
    """Tokens with a leading CLS token -> [B, C, 24, 24] feature map."""
    return _tokens_to_feature_map(tensor, height, width, drop_cls_token=True)


def no_cls_token_s12_transform(tensor, height=12, width=12):  # based on pytorch_grad_cam
    """Tokens without a CLS token -> [B, C, 12, 12] feature map."""
    return _tokens_to_feature_map(tensor, height, width, drop_cls_token=False)


def swinT_transform_224(tensor, height=7, width=7):  # 224 7
    """Tokens without a CLS token -> [B, C, 7, 7] feature map (used for edge size 224)."""
    return _tokens_to_feature_map(tensor, height, width, drop_cls_token=False)


def swinT_transform_384(tensor, height=12, width=12):  # 384 12
    """Tokens without a CLS token -> [B, C, 12, 12] feature map (used for edge size 384)."""
    return _tokens_to_feature_map(tensor, height, width, drop_cls_token=False)
def _cam_target_and_transform(model, model_idx, edge_size, model_type):
    """
    Pick the Grad-CAM target layer and reshape_transform for a model family / input size.

    :return: (target_layers, reshape_transform), or None after printing an error message when the
             model_idx / edge_size combination has no preset
    """
    is_cls = model_type == 'CLS'  # otherwise (MIL-SI) the network lives under model.backbone

    if model_idx.startswith('ViT') or model_idx.startswith('deit'):
        # We should chose any layer before the final attention block,
        # check: https://github.com/jacobgil/pytorch-grad-cam/blob/master/tutorials/vision_transformers.md
        target_layers = [model.blocks[-1].norm1] if is_cls else [model.backbone.blocks[-1].norm1]

        if model_idx.startswith('ViT_h'):
            if edge_size == 224:
                return target_layers, cls_token_s16_transform
            print('ERRO in ViT_huge edge size')
            return None

        if edge_size == 384:
            return target_layers, cls_token_s24_transform
        if edge_size == 224:
            return target_layers, cls_token_s14_transform
        print('ERRO in ViT/DeiT edge size')
        return None

    if model_idx.startswith('vgg'):
        target_layers = [model.features[-1]] if is_cls else [model.backbone.features[-1]]
        return target_layers, None  # CNN: None

    if model_idx.startswith('swin_b'):
        if is_cls:
            target_layers = [model.layers[-1].blocks[-1].norm1]
        else:
            target_layers = [model.backbone.layers[-1].blocks[-1].norm1]

        if edge_size == 384:
            return target_layers, swinT_transform_384
        if edge_size == 224:
            return target_layers, swinT_transform_224
        print('ERRO in Swin Transformer edge size')
        return None

    if model_idx.startswith('ResNet'):
        target_layers = [model.layer4[-1]] if is_cls else [model.backbone.layer4[-1]]
        return target_layers, None  # CNN: None

    if model_idx.startswith('Hybrid1') and edge_size == 384:
        return [model.blocks[-1].norm1], cls_token_s12_transform

    if model_idx.startswith('Hybrid2') and edge_size == 384:
        name_parts = model_idx.split('_')
        if 'CLS' in name_parts and 'No' in name_parts:
            return [model.dec4.norm1], no_cls_token_s12_transform
        return [model.dec4.norm1], cls_token_s12_transform

    if model_idx.startswith('Hybrid3') and edge_size == 384:
        return [model.dec3.norm1], cls_token_s24_transform

    if model_idx.startswith('mobilenet'):
        target_layers = [model.blocks[-1]] if is_cls else [model.backbone.blocks[-1]]
        return target_layers, None  # CNN: None

    if model_idx.startswith('ResN50_ViT'):
        # this branch used to be gated on edge_size == 384, which made its 224 case unreachable
        target_layers = [model.blocks[-1].norm1] if is_cls else [model.backbone.blocks[-1].norm1]

        if edge_size == 384:
            return target_layers, cls_token_s24_transform
        if edge_size == 224:
            return target_layers, cls_token_s14_transform
        print('ERRO in ResN50_ViT edge size')
        return None

    if model_idx.startswith('efficientnet'):
        return [model.conv_head], None  # CNN: None

    print('ERRO in model_idx')
    return None


def choose_cam_by_model(model, model_idx, edge_size, use_cuda=True, model_type='CLS'):
    """
    Build a GradCAM object with the pre-setted target layer and reshape transform of a model.

    :param model: model object
    :param model_idx: model idx for the getting pre-setted layer and size
    :param edge_size: image size for the getting pre-setted layer and size

    :param use_cuda: use cuda to speed up imaging
    :param model_type: default 'CLS' for model, 'MIL' for model backbone

    :return: the GradCAM object, or -1 (after printing the reason) when model_idx / edge_size
             has no preset
    """
    from pytorch_grad_cam import GradCAM

    # reshape_transform todo conformer 224!!
    # check class: target_category = None
    # If None, returns the map for the highest scoring category.
    # Otherwise, targets the requested category.

    preset = _cam_target_and_transform(model, model_idx, edge_size, model_type)
    if preset is None:
        return -1

    target_layers, reshape_transform = preset
    # NOTE(review): `use_cuda` is the constructor argument of the pytorch_grad_cam version this file
    # was written against; confirm the installed version still accepts it.
    return GradCAM(model, target_layers=target_layers, use_cuda=use_cuda, reshape_transform=reshape_transform)
checking_type = ['ori', ] + checking_type.extend([cls for cls in range(len(class_names))]) + else: + checking_type = ['ori', 'tar'] + + # test model + was_training = model.training + model.eval() + + outputs = model(inputs) + _, preds = torch.max(outputs, 1) + + grad_cam = choose_cam_by_model(model, model_idx, edge_size, model_type=model_type) # choose model + + if num_images == -1: # auto detect a batch + num_images = int(inputs.shape[0]) + + images_so_far = 0 + plt.figure() + + for j in range(num_images): + + for type in checking_type: + images_so_far += 1 + if type == 'ori': + ax = plt.subplot(num_images, len(checking_type), images_so_far) + ax.axis('off') + + if unknown_GT and not len(labels) == 1: # Ground Truth of the K+1 soft label + soft_label = labels.cpu().numpy()[j] # K+1 soft label + title = 'A' + str(round(soft_label[0], 0)) + for i in range(1, len(soft_label)): + title += class_names[i - 1][0] # use the first character only + title += str(round(soft_label[i], 0)) # use int (float 0) + title += ' ' + ax.set_title(title) + + else: + ax.set_title('Ground Truth:{}'.format(class_names[int(labels[j])])) + + imshow(inputs.cpu().data[j]) + plt.pause(0.001) # pause a bit so that plots are updated + + else: + ax = plt.subplot(num_images, len(checking_type), images_so_far) + ax.axis('off') + if type == 'tar': # target categories + ax.set_title('Predict: {}'.format(class_names[preds[j]])) + # focus on the specific target class to create grayscale_cam + # grayscale_cam is generate on batch + grayscale_cam = grad_cam(inputs, target_category=None, eigen_smooth=False, aug_smooth=False) + else: + # pseudo confidence by softmax + ax.set_title('{:.1%} {}'.format(softmax(outputs[j])[int(type)], class_names[int(type)])) + # focus on the specific target class to create grayscale_cam + # grayscale_cam is generate on batch + grayscale_cam = grad_cam(inputs, target_category=int(type), eigen_smooth=False, aug_smooth=False) + + # get a cv2 encoding image from dataloder by 
def visualize_check(inputs, labels, model, class_names, num_images=-1, pic_name='test',
                    draw_path='/home/ZTY/imaging_results', writer=None):  # visual check
    """
    Plot a batch of images titled with their predicted and true class.

    The images are laid out on a grid with 5, 4, 3 or 2 images per row (the first
    of these that divides num_images); otherwise all images share a single row.
    The picture is saved as <draw_path>/<pic_name>.jpg and optionally attached to
    a tensorboard writer.

    :param inputs: inputs of data
    :param labels: labels of data

    :param model: model object
    :param class_names: the names of the classes, used for the titles
    :param num_images: how many images to check; -1 (default) or any value larger
                       than the batch checks the whole batch
    :param pic_name: name of the output pic (without extension)
    :param draw_path: folder for the output pic, created when missing
    :param writer: attach the pic to the tensorboard backend when given

    :return: None
    """
    was_training = model.training
    model.eval()

    try:
        with torch.no_grad():
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)

        batch_size = int(inputs.shape[0])
        if num_images == -1 or num_images > batch_size:  # auto detect / clamp to the batch
            num_images = batch_size
        if num_images <= 0:
            return

        # images per row: the first of 5, 4, 3, 2 that divides num_images, else one single row
        for per_row in (5, 4, 3, 2):
            if num_images % per_row == 0:
                line_imgs_num = per_row
                break
        else:
            line_imgs_num = int(num_images)
        rows_imgs_num = num_images // line_imgs_num

        images_cpu = inputs.cpu().data  # moved to cpu once for the whole figure

        plt.figure()
        try:
            for j in range(num_images):  # each batch input idx: j
                ax = plt.subplot(rows_imgs_num, line_imgs_num, j + 1)
                ax.axis('off')
                ax.set_title('Pred: {} True: {}'.format(class_names[int(preds[j])], class_names[int(labels[j])]))
                imshow(images_cpu[j])

            os.makedirs(draw_path, exist_ok=True)
            picpath = os.path.join(draw_path, pic_name + '.jpg')

            plt.savefig(picpath, dpi=1000)
            plt.show()

            if writer is not None:  # attach the pic to the tensorboard backend if available
                with Image.open(picpath) as image_PIL:  # close the file handle once it is read
                    img = np.array(image_PIL)
                writer.add_image(pic_name, img, 1, dataformats='HWC')
        finally:
            plt.cla()
            plt.close("all")
    finally:
        # always hand the model back in the mode it came in
        model.train(mode=was_training)
def unpatchify(pred, patch_size=16):
    """
    Decode embedded patch tokens back into images.

    input:
    pred: (B, num_patches, patch_size**2 * C) AKA [B, num_patches, flatten_dim],
          without the CLS token; num_patches must be a perfect square
    patch_size: edge length of one square patch

    output:
    imgs: (B, C, H, W) with H = W = sqrt(num_patches) * patch_size

    The channel number C is inferred from flatten_dim (3 for RGB tokens).

    :raises ValueError: when num_patches is not a perfect square or flatten_dim
                        is not a positive multiple of patch_size**2
    """
    import math

    batch, num_patches, flatten_dim = pred.shape

    # square root of num_patches (a CLS token would break the square grid)
    h = w = math.isqrt(num_patches)
    if h * w != num_patches:
        raise ValueError('num_patches ({}) is not a perfect square, remove the CLS token first'.format(num_patches))

    channels, remainder = divmod(flatten_dim, patch_size ** 2)
    if channels == 0 or remainder != 0:
        raise ValueError('flatten_dim ({}) is not a positive multiple of patch_size**2 ({})'.format(
            flatten_dim, patch_size ** 2))

    # ReArrange dimensions [B, num_patches, flatten_dim] -> [B, h_p, w_p, patch_size, patch_size, C]
    pred = pred.reshape(batch, h, w, patch_size, patch_size, channels)
    # ReArrange dimensions [B, h_p, w_p, patch_size, patch_size, C] -> [B, C, h_p, patch_size, w_p, patch_size]
    pred = torch.einsum('nhwpqc->nchpwq', pred)
    # use reshape to compose patch [B, C, h_p, patch_size, w_p, patch_size] -> [B, C, H, W]
    return pred.reshape(batch, channels, h * patch_size, w * patch_size)
def patchify(imgs, patch_size=16):
    """
    Break images into flattened patch tokens.

    input:
    imgs: (B, C, H, W) with H == W and H divisible by patch_size
    patch_size: edge length of one square patch

    output:
    x: (B, num_patches, patch_size**2 * C) AKA [B, num_patches, flatten_dim]

    The channel number C is taken from the input (3 for RGB images).

    :raises ValueError: when the images are not square or their edge is not
                        divisible by patch_size
    """
    batch, channels, height, width = imgs.shape
    if height != width or height % patch_size != 0:
        raise ValueError('images must be square with an edge divisible by patch_size ({}), got {}x{}'.format(
            patch_size, height, width))

    # patch num in row or column
    h = w = height // patch_size

    # use reshape to split patch [B, C, H, W] -> [B, C, h_p, patch_size, w_p, patch_size]
    tokens = imgs.reshape(batch, channels, h, patch_size, w, patch_size)
    # ReArrange dimensions [B, C, h_p, patch_size, w_p, patch_size] -> [B, h_p, w_p, patch_size, patch_size, C]
    tokens = torch.einsum('nchpwq->nhwpqc', tokens)
    # ReArrange dimensions [B, h_p, w_p, patch_size, patch_size, C] -> [B, num_patches, flatten_dim]
    return tokens.reshape(batch, h * w, patch_size ** 2 * channels)


def anti_tensor_norm(batch_tensor):
    """Placeholder: not implemented yet, currently returns None."""
    pass  # TODO: the implementation still has to be thought through