jadechoghari
committed
Create convert.py
convert.py +781 -0
convert.py
ADDED
@@ -0,0 +1,781 @@
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert RT Detr checkpoints with Timm backbone"""

import argparse
import json
from pathlib import Path

import requests
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from torchvision import transforms

from transformers import RTDetrConfig, RTDetrForObjectDetection, RTDetrImageProcessor
from transformers.utils import logging


logging.set_verbosity_info()
logger = logging.get_logger(__name__)


def get_rt_detr_config(model_name: str) -> RTDetrConfig:
    config = RTDetrConfig()

    config.num_labels = 80
    repo_id = "huggingface/label-files"
    filename = "coco-detection-mmdet-id2label.json"
    id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
    id2label = {int(k): v for k, v in id2label.items()}
    config.id2label = id2label
    config.label2id = {v: k for k, v in id2label.items()}

    if model_name == "rtdetr_r18vd":
        config.backbone_config.hidden_sizes = [64, 128, 256, 512]
        config.backbone_config.depths = [2, 2, 2, 2]
        config.backbone_config.layer_type = "basic"
        config.encoder_in_channels = [128, 256, 512]
        config.hidden_expansion = 0.5
        config.decoder_layers = 3
    elif model_name == "rtdetr_r34vd":
        config.backbone_config.hidden_sizes = [64, 128, 256, 512]
        config.backbone_config.depths = [3, 4, 6, 3]
        config.backbone_config.layer_type = "basic"
        config.encoder_in_channels = [128, 256, 512]
        config.hidden_expansion = 0.5
        config.decoder_layers = 4
    elif model_name == "rtdetr_r50vd_m":
        pass
    elif model_name == "rtdetr_r50vd":
        pass
    elif model_name == "rtdetr_r101vd":
        config.backbone_config.depths = [3, 4, 23, 3]
        config.encoder_ffn_dim = 2048
        config.encoder_hidden_dim = 384
        config.decoder_in_channels = [384, 384, 384]
    elif model_name == "rtdetr_r18vd_coco_o365":
        config.backbone_config.hidden_sizes = [64, 128, 256, 512]
        config.backbone_config.depths = [2, 2, 2, 2]
        config.backbone_config.layer_type = "basic"
        config.encoder_in_channels = [128, 256, 512]
        config.hidden_expansion = 0.5
        config.decoder_layers = 3
    elif model_name == "rtdetr_r50vd_coco_o365":
        pass
    elif model_name == "rtdetr_r101vd_coco_o365":
        config.backbone_config.depths = [3, 4, 23, 3]
        config.encoder_ffn_dim = 2048
        config.encoder_hidden_dim = 384
        config.decoder_in_channels = [384, 384, 384]

    return config


def create_rename_keys(config):
    # here we list all keys to be renamed (original name on the left, our name on the right)
    rename_keys = []

    # stem
    # fmt: off
    last_key = ["weight", "bias", "running_mean", "running_var"]

    for level in range(3):
        rename_keys.append((f"backbone.conv1.conv1_{level+1}.conv.weight", f"model.backbone.model.embedder.embedder.{level}.convolution.weight"))
        for last in last_key:
            rename_keys.append((f"backbone.conv1.conv1_{level+1}.norm.{last}", f"model.backbone.model.embedder.embedder.{level}.normalization.{last}"))

    for stage_idx in range(len(config.backbone_config.depths)):
        for layer_idx in range(config.backbone_config.depths[stage_idx]):
            # shortcut
            if layer_idx == 0:
                if stage_idx == 0:
                    rename_keys.append(
                        (
                            f"backbone.res_layers.{stage_idx}.blocks.0.short.conv.weight",
                            f"model.backbone.model.encoder.stages.{stage_idx}.layers.0.shortcut.convolution.weight",
                        )
                    )
                    for last in last_key:
                        rename_keys.append(
                            (
                                f"backbone.res_layers.{stage_idx}.blocks.0.short.norm.{last}",
                                f"model.backbone.model.encoder.stages.{stage_idx}.layers.0.shortcut.normalization.{last}",
                            )
                        )
                else:
                    rename_keys.append(
                        (
                            f"backbone.res_layers.{stage_idx}.blocks.0.short.conv.conv.weight",
                            f"model.backbone.model.encoder.stages.{stage_idx}.layers.0.shortcut.1.convolution.weight",
                        )
                    )
                    for last in last_key:
                        rename_keys.append(
                            (
                                f"backbone.res_layers.{stage_idx}.blocks.0.short.conv.norm.{last}",
                                f"model.backbone.model.encoder.stages.{stage_idx}.layers.0.shortcut.1.normalization.{last}",
                            )
                        )

            rename_keys.append(
                (
                    f"backbone.res_layers.{stage_idx}.blocks.{layer_idx}.branch2a.conv.weight",
                    f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.0.convolution.weight",
                )
            )
            for last in last_key:
                rename_keys.append((
                    f"backbone.res_layers.{stage_idx}.blocks.{layer_idx}.branch2a.norm.{last}",
                    f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.0.normalization.{last}",
                ))

            rename_keys.append(
                (
                    f"backbone.res_layers.{stage_idx}.blocks.{layer_idx}.branch2b.conv.weight",
                    f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.1.convolution.weight",
                )
            )
            for last in last_key:
                rename_keys.append((
                    f"backbone.res_layers.{stage_idx}.blocks.{layer_idx}.branch2b.norm.{last}",
                    f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.1.normalization.{last}",
                ))

            # https://github.com/lyuwenyu/RT-DETR/blob/94f5e16708329d2f2716426868ec89aa774af016/rtdetr_pytorch/src/nn/backbone/presnet.py#L171
            if config.backbone_config.layer_type != "basic":
                rename_keys.append(
                    (
                        f"backbone.res_layers.{stage_idx}.blocks.{layer_idx}.branch2c.conv.weight",
                        f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.2.convolution.weight",
                    )
                )
                for last in last_key:
                    rename_keys.append((
                        f"backbone.res_layers.{stage_idx}.blocks.{layer_idx}.branch2c.norm.{last}",
                        f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.2.normalization.{last}",
                    ))
    # fmt: on

    for i in range(config.encoder_layers):
        # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms
        rename_keys.append(
            (
                f"encoder.encoder.{i}.layers.0.self_attn.out_proj.weight",
                f"model.encoder.encoder.{i}.layers.0.self_attn.out_proj.weight",
            )
        )
        rename_keys.append(
            (
                f"encoder.encoder.{i}.layers.0.self_attn.out_proj.bias",
                f"model.encoder.encoder.{i}.layers.0.self_attn.out_proj.bias",
            )
        )
        rename_keys.append(
            (
                f"encoder.encoder.{i}.layers.0.linear1.weight",
                f"model.encoder.encoder.{i}.layers.0.fc1.weight",
            )
        )
        rename_keys.append(
            (
                f"encoder.encoder.{i}.layers.0.linear1.bias",
                f"model.encoder.encoder.{i}.layers.0.fc1.bias",
            )
        )
        rename_keys.append(
            (
                f"encoder.encoder.{i}.layers.0.linear2.weight",
                f"model.encoder.encoder.{i}.layers.0.fc2.weight",
            )
        )
        rename_keys.append(
            (
                f"encoder.encoder.{i}.layers.0.linear2.bias",
                f"model.encoder.encoder.{i}.layers.0.fc2.bias",
            )
        )
        rename_keys.append(
            (
                f"encoder.encoder.{i}.layers.0.norm1.weight",
                f"model.encoder.encoder.{i}.layers.0.self_attn_layer_norm.weight",
            )
        )
        rename_keys.append(
            (
                f"encoder.encoder.{i}.layers.0.norm1.bias",
                f"model.encoder.encoder.{i}.layers.0.self_attn_layer_norm.bias",
            )
        )
        rename_keys.append(
            (
                f"encoder.encoder.{i}.layers.0.norm2.weight",
                f"model.encoder.encoder.{i}.layers.0.final_layer_norm.weight",
            )
        )
        rename_keys.append(
            (
                f"encoder.encoder.{i}.layers.0.norm2.bias",
                f"model.encoder.encoder.{i}.layers.0.final_layer_norm.bias",
            )
        )

    for j in range(0, 3):
        rename_keys.append((f"encoder.input_proj.{j}.conv.weight", f"model.encoder_input_proj.{j}.0.weight"))
        for last in last_key:
            rename_keys.append((f"encoder.input_proj.{j}.norm.{last}", f"model.encoder_input_proj.{j}.1.{last}"))

    block_levels = 3 if config.backbone_config.layer_type != "basic" else 4

    for i in range(len(config.encoder_in_channels) - 1):
        # encoder layers: hybridencoder parts
        for j in range(1, block_levels):
            rename_keys.append(
                (f"encoder.fpn_blocks.{i}.conv{j}.conv.weight", f"model.encoder.fpn_blocks.{i}.conv{j}.conv.weight")
            )
            for last in last_key:
                rename_keys.append(
                    (
                        f"encoder.fpn_blocks.{i}.conv{j}.norm.{last}",
                        f"model.encoder.fpn_blocks.{i}.conv{j}.norm.{last}",
                    )
                )

        rename_keys.append((f"encoder.lateral_convs.{i}.conv.weight", f"model.encoder.lateral_convs.{i}.conv.weight"))
        for last in last_key:
            rename_keys.append(
                (f"encoder.lateral_convs.{i}.norm.{last}", f"model.encoder.lateral_convs.{i}.norm.{last}")
            )

        for j in range(3):
            for k in range(1, 3):
                rename_keys.append(
                    (
                        f"encoder.fpn_blocks.{i}.bottlenecks.{j}.conv{k}.conv.weight",
                        f"model.encoder.fpn_blocks.{i}.bottlenecks.{j}.conv{k}.conv.weight",
                    )
                )
                for last in last_key:
                    rename_keys.append(
                        (
                            f"encoder.fpn_blocks.{i}.bottlenecks.{j}.conv{k}.norm.{last}",
                            f"model.encoder.fpn_blocks.{i}.bottlenecks.{j}.conv{k}.norm.{last}",
                        )
                    )

        for j in range(1, block_levels):
            rename_keys.append(
                (f"encoder.pan_blocks.{i}.conv{j}.conv.weight", f"model.encoder.pan_blocks.{i}.conv{j}.conv.weight")
            )
            for last in last_key:
                rename_keys.append(
                    (
                        f"encoder.pan_blocks.{i}.conv{j}.norm.{last}",
                        f"model.encoder.pan_blocks.{i}.conv{j}.norm.{last}",
                    )
                )

        for j in range(3):
            for k in range(1, 3):
                rename_keys.append(
                    (
                        f"encoder.pan_blocks.{i}.bottlenecks.{j}.conv{k}.conv.weight",
                        f"model.encoder.pan_blocks.{i}.bottlenecks.{j}.conv{k}.conv.weight",
                    )
                )
                for last in last_key:
                    rename_keys.append(
                        (
                            f"encoder.pan_blocks.{i}.bottlenecks.{j}.conv{k}.norm.{last}",
                            f"model.encoder.pan_blocks.{i}.bottlenecks.{j}.conv{k}.norm.{last}",
                        )
                    )

        rename_keys.append(
            (f"encoder.downsample_convs.{i}.conv.weight", f"model.encoder.downsample_convs.{i}.conv.weight")
        )
        for last in last_key:
            rename_keys.append(
                (f"encoder.downsample_convs.{i}.norm.{last}", f"model.encoder.downsample_convs.{i}.norm.{last}")
            )

    for i in range(config.decoder_layers):
        # decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms
        rename_keys.append(
            (
                f"decoder.decoder.layers.{i}.self_attn.out_proj.weight",
                f"model.decoder.layers.{i}.self_attn.out_proj.weight",
            )
        )
        rename_keys.append(
            (
                f"decoder.decoder.layers.{i}.self_attn.out_proj.bias",
                f"model.decoder.layers.{i}.self_attn.out_proj.bias",
            )
        )
        rename_keys.append(
            (
                f"decoder.decoder.layers.{i}.cross_attn.sampling_offsets.weight",
                f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.weight",
            )
        )
        rename_keys.append(
            (
                f"decoder.decoder.layers.{i}.cross_attn.sampling_offsets.bias",
                f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.bias",
            )
        )
        rename_keys.append(
            (
                f"decoder.decoder.layers.{i}.cross_attn.attention_weights.weight",
                f"model.decoder.layers.{i}.encoder_attn.attention_weights.weight",
            )
        )
        rename_keys.append(
            (
                f"decoder.decoder.layers.{i}.cross_attn.attention_weights.bias",
                f"model.decoder.layers.{i}.encoder_attn.attention_weights.bias",
            )
        )
        rename_keys.append(
            (
                f"decoder.decoder.layers.{i}.cross_attn.value_proj.weight",
                f"model.decoder.layers.{i}.encoder_attn.value_proj.weight",
            )
        )
        rename_keys.append(
            (
                f"decoder.decoder.layers.{i}.cross_attn.value_proj.bias",
                f"model.decoder.layers.{i}.encoder_attn.value_proj.bias",
            )
        )
        rename_keys.append(
            (
                f"decoder.decoder.layers.{i}.cross_attn.output_proj.weight",
                f"model.decoder.layers.{i}.encoder_attn.output_proj.weight",
            )
        )
        rename_keys.append(
            (
                f"decoder.decoder.layers.{i}.cross_attn.output_proj.bias",
                f"model.decoder.layers.{i}.encoder_attn.output_proj.bias",
            )
        )
        rename_keys.append(
            (f"decoder.decoder.layers.{i}.norm1.weight", f"model.decoder.layers.{i}.self_attn_layer_norm.weight")
        )
        rename_keys.append(
            (f"decoder.decoder.layers.{i}.norm1.bias", f"model.decoder.layers.{i}.self_attn_layer_norm.bias")
        )
        rename_keys.append(
            (f"decoder.decoder.layers.{i}.norm2.weight", f"model.decoder.layers.{i}.encoder_attn_layer_norm.weight")
        )
        rename_keys.append(
            (f"decoder.decoder.layers.{i}.norm2.bias", f"model.decoder.layers.{i}.encoder_attn_layer_norm.bias")
        )
        rename_keys.append((f"decoder.decoder.layers.{i}.linear1.weight", f"model.decoder.layers.{i}.fc1.weight"))
        rename_keys.append((f"decoder.decoder.layers.{i}.linear1.bias", f"model.decoder.layers.{i}.fc1.bias"))
        rename_keys.append((f"decoder.decoder.layers.{i}.linear2.weight", f"model.decoder.layers.{i}.fc2.weight"))
        rename_keys.append((f"decoder.decoder.layers.{i}.linear2.bias", f"model.decoder.layers.{i}.fc2.bias"))
        rename_keys.append(
            (f"decoder.decoder.layers.{i}.norm3.weight", f"model.decoder.layers.{i}.final_layer_norm.weight")
        )
        rename_keys.append(
            (f"decoder.decoder.layers.{i}.norm3.bias", f"model.decoder.layers.{i}.final_layer_norm.bias")
        )

    for i in range(config.decoder_layers):
        # decoder + class and bounding box heads
        rename_keys.append(
            (
                f"decoder.dec_score_head.{i}.weight",
                f"model.decoder.class_embed.{i}.weight",
            )
        )
        rename_keys.append(
            (
                f"decoder.dec_score_head.{i}.bias",
                f"model.decoder.class_embed.{i}.bias",
            )
        )
        rename_keys.append(
            (
                f"decoder.dec_bbox_head.{i}.layers.0.weight",
                f"model.decoder.bbox_embed.{i}.layers.0.weight",
            )
        )
        rename_keys.append(
            (
                f"decoder.dec_bbox_head.{i}.layers.0.bias",
                f"model.decoder.bbox_embed.{i}.layers.0.bias",
            )
        )
        rename_keys.append(
            (
                f"decoder.dec_bbox_head.{i}.layers.1.weight",
                f"model.decoder.bbox_embed.{i}.layers.1.weight",
            )
        )
        rename_keys.append(
            (
                f"decoder.dec_bbox_head.{i}.layers.1.bias",
                f"model.decoder.bbox_embed.{i}.layers.1.bias",
            )
        )
        rename_keys.append(
            (
                f"decoder.dec_bbox_head.{i}.layers.2.weight",
                f"model.decoder.bbox_embed.{i}.layers.2.weight",
            )
        )
        rename_keys.append(
            (
                f"decoder.dec_bbox_head.{i}.layers.2.bias",
                f"model.decoder.bbox_embed.{i}.layers.2.bias",
            )
        )

    # decoder projection
    for i in range(len(config.decoder_in_channels)):
        rename_keys.append(
            (
                f"decoder.input_proj.{i}.conv.weight",
                f"model.decoder_input_proj.{i}.0.weight",
            )
        )
        for last in last_key:
            rename_keys.append(
                (
                    f"decoder.input_proj.{i}.norm.{last}",
                    f"model.decoder_input_proj.{i}.1.{last}",
                )
            )

    # convolutional projection + query embeddings + layernorm of decoder + class and bounding box heads
    rename_keys.extend(
        [
            ("decoder.denoising_class_embed.weight", "model.denoising_class_embed.weight"),
            ("decoder.query_pos_head.layers.0.weight", "model.decoder.query_pos_head.layers.0.weight"),
            ("decoder.query_pos_head.layers.0.bias", "model.decoder.query_pos_head.layers.0.bias"),
            ("decoder.query_pos_head.layers.1.weight", "model.decoder.query_pos_head.layers.1.weight"),
            ("decoder.query_pos_head.layers.1.bias", "model.decoder.query_pos_head.layers.1.bias"),
            ("decoder.enc_output.proj.weight", "model.enc_output.0.weight"),
            ("decoder.enc_output.proj.bias", "model.enc_output.0.bias"),
            ("decoder.enc_output.norm.weight", "model.enc_output.1.weight"),
            ("decoder.enc_output.norm.bias", "model.enc_output.1.bias"),
            ("decoder.enc_score_head.weight", "model.enc_score_head.weight"),
            ("decoder.enc_score_head.bias", "model.enc_score_head.bias"),
            ("decoder.enc_bbox_head.layers.0.weight", "model.enc_bbox_head.layers.0.weight"),
            ("decoder.enc_bbox_head.layers.0.bias", "model.enc_bbox_head.layers.0.bias"),
            ("decoder.enc_bbox_head.layers.1.weight", "model.enc_bbox_head.layers.1.weight"),
            ("decoder.enc_bbox_head.layers.1.bias", "model.enc_bbox_head.layers.1.bias"),
            ("decoder.enc_bbox_head.layers.2.weight", "model.enc_bbox_head.layers.2.weight"),
            ("decoder.enc_bbox_head.layers.2.bias", "model.enc_bbox_head.layers.2.bias"),
            ("decoder.decoder.layers.0.cross_attn.num_points_scale", "model.decoder.layers.0.cross_attn.num_points_scale"),
            ("decoder.decoder.layers.1.cross_attn.num_points_scale", "model.decoder.layers.1.cross_attn.num_points_scale"),
            ("decoder.decoder.layers.2.cross_attn.num_points_scale", "model.decoder.layers.2.cross_attn.num_points_scale"),
            ("decoder.valid_mask", "model.decoder.valid_mask"),
            ("decoder.anchors", "model.decoder.anchors"),
        ]
    )

    return rename_keys


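# For reference (added note): each pair produced by create_rename_keys maps an original
# checkpoint key to its Hugging Face counterpart; for example, the first stem entry is
# ("backbone.conv1.conv1_1.conv.weight", "model.backbone.model.embedder.embedder.0.convolution.weight").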
def rename_key(state_dict, old, new):
    try:
        val = state_dict.pop(old)
        state_dict[new] = val
    except Exception:
        pass


def read_in_q_k_v(state_dict, config):
    prefix = ""
    encoder_hidden_dim = config.encoder_hidden_dim

    # first: transformer encoder
    for i in range(config.encoder_layers):
        # read in weights + bias of input projection layer (in PyTorch's MultiHeadAttention, this is a single matrix + bias)
        in_proj_weight = state_dict.pop(f"{prefix}encoder.encoder.{i}.layers.0.self_attn.in_proj_weight")
        in_proj_bias = state_dict.pop(f"{prefix}encoder.encoder.{i}.layers.0.self_attn.in_proj_bias")
        # next, add query, keys and values (in that order) to the state dict
        state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.q_proj.weight"] = in_proj_weight[
            :encoder_hidden_dim, :
        ]
        state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.q_proj.bias"] = in_proj_bias[:encoder_hidden_dim]
        state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.k_proj.weight"] = in_proj_weight[
            encoder_hidden_dim : 2 * encoder_hidden_dim, :
        ]
        state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.k_proj.bias"] = in_proj_bias[
            encoder_hidden_dim : 2 * encoder_hidden_dim
        ]
        state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.v_proj.weight"] = in_proj_weight[
            -encoder_hidden_dim:, :
        ]
        state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.v_proj.bias"] = in_proj_bias[-encoder_hidden_dim:]
    # next: transformer decoder (which is a bit more complex because it also includes cross-attention)
    for i in range(config.decoder_layers):
        # read in weights + bias of input projection layer of self-attention
        in_proj_weight = state_dict.pop(f"{prefix}decoder.decoder.layers.{i}.self_attn.in_proj_weight")
        in_proj_bias = state_dict.pop(f"{prefix}decoder.decoder.layers.{i}.self_attn.in_proj_bias")
        # next, add query, keys and values (in that order) to the state dict
        state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :]
        state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256]
        state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :]
        state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512]
        state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :]
        state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:]


# We will verify our results on an image of cute cats
def prepare_img():
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    im = Image.open(requests.get(url, stream=True).raw)

    return im


@torch.no_grad()
def convert_rt_detr_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, repo_id):
    """
    Copy/paste/tweak model's weights to our RTDETR structure.
    """

    # load default config
    config = get_rt_detr_config(model_name)

    # load original model from torch hub
    model_name_to_checkpoint_url = {
        "rtdetr_r18vd": "https://github.com/lyuwenyu/storage/releases/download/v0.2/rtdetrv2_r18vd_120e_coco_rerun_48.1.pth"
    }
    logger.info(f"Converting model {model_name}...")
    state_dict = torch.hub.load_state_dict_from_url(model_name_to_checkpoint_url[model_name], map_location="cpu")[
        "ema"
    ]["module"]

    # rename keys
    for src, dest in create_rename_keys(config):
        rename_key(state_dict, src, dest)
    # query, key and value matrices need special treatment
    read_in_q_k_v(state_dict, config)
    # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them
    for key in state_dict.copy().keys():
        if key.endswith("num_batches_tracked"):
            del state_dict[key]
        # for two_stage
        if "bbox_embed" in key or ("class_embed" in key and "denoising_" not in key):
            state_dict[key.split("model.decoder.")[-1]] = state_dict[key]

    print("done renaming now loading")
    # finally, create HuggingFace model and load state dict
    model = RTDetrForObjectDetection(config)
    model.load_state_dict(state_dict, strict=False)
    model.eval()

    # load image processor
    image_processor = RTDetrImageProcessor()

    # prepare image
    img = prepare_img()

    # preprocess image
    transformations = transforms.Compose(
        [
            transforms.Resize([640, 640], interpolation=transforms.InterpolationMode.BILINEAR),
            transforms.ToTensor(),
        ]
    )
    original_pixel_values = transformations(img).unsqueeze(0)  # insert batch dimension

    encoding = image_processor(images=img, return_tensors="pt")
    pixel_values = encoding["pixel_values"]

    assert torch.allclose(original_pixel_values, pixel_values)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    pixel_values = pixel_values.to(device)

    # Pass image by the model
    outputs = model(pixel_values)

    # if model_name == "rtdetr_r18vd":
    #     expected_slice_logits = torch.tensor(
    #         [
    #             [-4.3364253, -6.465683, -3.6130402],
    #             [-4.083815, -6.4039373, -6.97881],
    #             [-4.192215, -7.3410473, -6.9027247],
    #         ]
    #     )
    #     expected_slice_boxes = torch.tensor(
    #         [
    #             [0.16868353, 0.19833282, 0.21182671],
    #             [0.25559652, 0.55121744, 0.47988364],
    #             [0.7698693, 0.4124569, 0.46036878],
    #         ]
    #     )
    # elif model_name == "rtdetr_r34vd":
    #     expected_slice_logits = torch.tensor(
    #         [
    #             [-4.3727384, -4.7921476, -5.7299604],
    #             [-4.840536, -8.455345, -4.1745796],
    #             [-4.1277084, -5.2154565, -5.7852697],
    #         ]
    #     )
    #     expected_slice_boxes = torch.tensor(
    #         [
    #             [0.258278, 0.5497808, 0.4732004],
    #             [0.16889669, 0.19890057, 0.21138911],
    #             [0.76632994, 0.4147879, 0.46851268],
    #         ]
    #     )
    # elif model_name == "rtdetr_r50vd_m":
    #     expected_slice_logits = torch.tensor(
    #         [
    #             [-4.319764, -6.1349025, -6.094794],
    #             [-5.1056995, -7.744766, -4.803956],
    #             [-4.7685347, -7.9278393, -4.5751696],
    #         ]
    #     )
    #     expected_slice_boxes = torch.tensor(
    #         [
    #             [0.2582739, 0.55071366, 0.47660282],
    #             [0.16811174, 0.19954777, 0.21292639],
    #             [0.54986024, 0.2752091, 0.0561416],
    #         ]
    #     )
    # elif model_name == "rtdetr_r50vd":
    #     expected_slice_logits = torch.tensor(
    #         [
    #             [-4.6476398, -5.001154, -4.9785104],
    #             [-4.1593494, -4.7038546, -5.946485],
    #             [-4.4374595, -4.658361, -6.2352347],
    #         ]
    #     )
    #     expected_slice_boxes = torch.tensor(
    #         [
    #             [0.16880608, 0.19992264, 0.21225442],
    #             [0.76837635, 0.4122631, 0.46368608],
    #             [0.2595386, 0.5483334, 0.4777486],
    #         ]
    #     )
    # elif model_name == "rtdetr_r101vd":
    #     expected_slice_logits = torch.tensor(
    #         [
    #             [-4.6162, -4.9189, -4.6656],
    #             [-4.4701, -4.4997, -4.9659],
    #             [-5.6641, -7.9000, -5.0725],
    #         ]
    #     )
    #     expected_slice_boxes = torch.tensor(
    #         [
    #             [0.7707, 0.4124, 0.4585],
    #             [0.2589, 0.5492, 0.4735],
    #             [0.1688, 0.1993, 0.2108],
    #         ]
    #     )
    # elif model_name == "rtdetr_r18vd_coco_o365":
    #     expected_slice_logits = torch.tensor(
    #         [
    #             [-4.8726, -5.9066, -5.2450],
    #             [-4.8157, -6.8764, -5.1656],
    #             [-4.7492, -5.7006, -5.1333],
    #         ]
    #     )
    #     expected_slice_boxes = torch.tensor(
    #         [
    #             [0.2552, 0.5501, 0.4773],
    #             [0.1685, 0.1986, 0.2104],
    #             [0.7692, 0.4141, 0.4620],
    #         ]
    #     )
    # elif model_name == "rtdetr_r50vd_coco_o365":
    #     expected_slice_logits = torch.tensor(
    #         [
    #             [-4.6491, -3.9252, -5.3163],
    #             [-4.1386, -5.0348, -3.9016],
    #             [-4.4778, -4.5423, -5.7356],
    #         ]
    #     )
    #     expected_slice_boxes = torch.tensor(
    #         [
    #             [0.2583, 0.5492, 0.4747],
    #             [0.5501, 0.2754, 0.0574],
    #             [0.7693, 0.4137, 0.4613],
    #         ]
    #     )
    # elif model_name == "rtdetr_r101vd_coco_o365":
    #     expected_slice_logits = torch.tensor(
    #         [
    #             [-4.5152, -5.6811, -5.7311],
    #             [-4.5358, -7.2422, -5.0941],
    #             [-4.6919, -5.5834, -6.0145],
    #         ]
    #     )
    #     expected_slice_boxes = torch.tensor(
    #         [
    #             [0.7703, 0.4140, 0.4583],
    #             [0.1686, 0.1991, 0.2107],
    #             [0.2570, 0.5496, 0.4750],
    #         ]
    #     )
    # else:
    #     raise ValueError(f"Unknown rt_detr_name: {model_name}")

    # assert torch.allclose(outputs.logits[0, :3, :3], expected_slice_logits.to(outputs.logits.device), atol=1e-4)
    # assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes.to(outputs.pred_boxes.device), atol=1e-3)

    if pytorch_dump_folder_path is not None:
        Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
        print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
        model.save_pretrained(pytorch_dump_folder_path)
        print(f"Saving image processor to {pytorch_dump_folder_path}")
        image_processor.save_pretrained(pytorch_dump_folder_path)

    if push_to_hub:
        # Upload model, image processor and config to the hub
        logger.info("Uploading PyTorch model and image processor to the hub...")
        config.push_to_hub(
            repo_id=repo_id, commit_message="Add config from convert_rt_detr_original_pytorch_checkpoint_to_pytorch.py"
        )
        model.push_to_hub(
            repo_id=repo_id, commit_message="Add model from convert_rt_detr_original_pytorch_checkpoint_to_pytorch.py"
        )
        image_processor.push_to_hub(
            repo_id=repo_id,
            commit_message="Add image processor from convert_rt_detr_original_pytorch_checkpoint_to_pytorch.py",
        )


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name",
        default="rtdetr_r50vd",
        type=str,
        help="model_name of the checkpoint you'd like to convert.",
    )
    parser.add_argument(
        "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
    )
    parser.add_argument("--push_to_hub", action="store_true", help="Whether to push the model to the hub or not.")
    parser.add_argument(
        "--repo_id",
        type=str,
        help="repo_id where the model will be pushed to.",
    )
    args = parser.parse_args()
    convert_rt_detr_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.repo_id)