jadechoghari committed
Commit 73577d8 · verified · 1 Parent(s): 09fb28b

Create convert.py

Files changed (1):
  1. convert.py +781 -0
convert.py ADDED
@@ -0,0 +1,781 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert RT Detr checkpoints with Timm backbone"""
+
+import argparse
+import json
+from pathlib import Path
+
+import requests
+import torch
+from huggingface_hub import hf_hub_download
+from PIL import Image
+from torchvision import transforms
+
+from transformers import RTDetrConfig, RTDetrForObjectDetection, RTDetrImageProcessor
+from transformers.utils import logging
+
+
+logging.set_verbosity_info()
+logger = logging.get_logger(__name__)
+
+
+def get_rt_detr_config(model_name: str) -> RTDetrConfig:
+    config = RTDetrConfig()
+
+    config.num_labels = 80
+    repo_id = "huggingface/label-files"
+    filename = "coco-detection-mmdet-id2label.json"
+    id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
+    id2label = {int(k): v for k, v in id2label.items()}
+    config.id2label = id2label
+    config.label2id = {v: k for k, v in id2label.items()}
+
+    if model_name == "rtdetr_r18vd":
+        config.backbone_config.hidden_sizes = [64, 128, 256, 512]
+        config.backbone_config.depths = [2, 2, 2, 2]
+        config.backbone_config.layer_type = "basic"
+        config.encoder_in_channels = [128, 256, 512]
+        config.hidden_expansion = 0.5
+        config.decoder_layers = 3
+    elif model_name == "rtdetr_r34vd":
+        config.backbone_config.hidden_sizes = [64, 128, 256, 512]
+        config.backbone_config.depths = [3, 4, 6, 3]
+        config.backbone_config.layer_type = "basic"
+        config.encoder_in_channels = [128, 256, 512]
+        config.hidden_expansion = 0.5
+        config.decoder_layers = 4
+    elif model_name == "rtdetr_r50vd_m":
+        pass
+    elif model_name == "rtdetr_r50vd":
+        pass
+    elif model_name == "rtdetr_r101vd":
+        config.backbone_config.depths = [3, 4, 23, 3]
+        config.encoder_ffn_dim = 2048
+        config.encoder_hidden_dim = 384
+        config.decoder_in_channels = [384, 384, 384]
+    elif model_name == "rtdetr_r18vd_coco_o365":
+        config.backbone_config.hidden_sizes = [64, 128, 256, 512]
+        config.backbone_config.depths = [2, 2, 2, 2]
+        config.backbone_config.layer_type = "basic"
+        config.encoder_in_channels = [128, 256, 512]
+        config.hidden_expansion = 0.5
+        config.decoder_layers = 3
+    elif model_name == "rtdetr_r50vd_coco_o365":
+        pass
+    elif model_name == "rtdetr_r101vd_coco_o365":
+        config.backbone_config.depths = [3, 4, 23, 3]
+        config.encoder_ffn_dim = 2048
+        config.encoder_hidden_dim = 384
+        config.decoder_in_channels = [384, 384, 384]
+
+    return config
+
+
+def create_rename_keys(config):
+    # here we list all keys to be renamed (original name on the left, our name on the right)
+    rename_keys = []
+
+    # stem
+    # fmt: off
+    last_key = ["weight", "bias", "running_mean", "running_var"]
+
+    for level in range(3):
+        rename_keys.append((f"backbone.conv1.conv1_{level+1}.conv.weight", f"model.backbone.model.embedder.embedder.{level}.convolution.weight"))
+        for last in last_key:
+            rename_keys.append((f"backbone.conv1.conv1_{level+1}.norm.{last}", f"model.backbone.model.embedder.embedder.{level}.normalization.{last}"))
+
+    for stage_idx in range(len(config.backbone_config.depths)):
+        for layer_idx in range(config.backbone_config.depths[stage_idx]):
+            # shortcut
+            if layer_idx == 0:
+                if stage_idx == 0:
+                    rename_keys.append(
+                        (
+                            f"backbone.res_layers.{stage_idx}.blocks.0.short.conv.weight",
+                            f"model.backbone.model.encoder.stages.{stage_idx}.layers.0.shortcut.convolution.weight",
+                        )
+                    )
+                    for last in last_key:
+                        rename_keys.append(
+                            (
+                                f"backbone.res_layers.{stage_idx}.blocks.0.short.norm.{last}",
+                                f"model.backbone.model.encoder.stages.{stage_idx}.layers.0.shortcut.normalization.{last}",
+                            )
+                        )
+                else:
+                    rename_keys.append(
+                        (
+                            f"backbone.res_layers.{stage_idx}.blocks.0.short.conv.conv.weight",
+                            f"model.backbone.model.encoder.stages.{stage_idx}.layers.0.shortcut.1.convolution.weight",
+                        )
+                    )
+                    for last in last_key:
+                        rename_keys.append(
+                            (
+                                f"backbone.res_layers.{stage_idx}.blocks.0.short.conv.norm.{last}",
+                                f"model.backbone.model.encoder.stages.{stage_idx}.layers.0.shortcut.1.normalization.{last}",
+                            )
+                        )
+
+            rename_keys.append(
+                (
+                    f"backbone.res_layers.{stage_idx}.blocks.{layer_idx}.branch2a.conv.weight",
+                    f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.0.convolution.weight",
+                )
+            )
+            for last in last_key:
+                rename_keys.append((
+                    f"backbone.res_layers.{stage_idx}.blocks.{layer_idx}.branch2a.norm.{last}",
+                    f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.0.normalization.{last}",
+                ))
+
+            rename_keys.append(
+                (
+                    f"backbone.res_layers.{stage_idx}.blocks.{layer_idx}.branch2b.conv.weight",
+                    f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.1.convolution.weight",
+                )
+            )
+            for last in last_key:
+                rename_keys.append((
+                    f"backbone.res_layers.{stage_idx}.blocks.{layer_idx}.branch2b.norm.{last}",
+                    f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.1.normalization.{last}",
+                ))
+
+            # https://github.com/lyuwenyu/RT-DETR/blob/94f5e16708329d2f2716426868ec89aa774af016/rtdetr_pytorch/src/nn/backbone/presnet.py#L171
+            if config.backbone_config.layer_type != "basic":
+                rename_keys.append(
+                    (
+                        f"backbone.res_layers.{stage_idx}.blocks.{layer_idx}.branch2c.conv.weight",
+                        f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.2.convolution.weight",
+                    )
+                )
+                for last in last_key:
+                    rename_keys.append((
+                        f"backbone.res_layers.{stage_idx}.blocks.{layer_idx}.branch2c.norm.{last}",
+                        f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.2.normalization.{last}",
+                    ))
+    # fmt: on
+
+    for i in range(config.encoder_layers):
+        # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms
+        rename_keys.append(
+            (
+                f"encoder.encoder.{i}.layers.0.self_attn.out_proj.weight",
+                f"model.encoder.encoder.{i}.layers.0.self_attn.out_proj.weight",
+            )
+        )
+        rename_keys.append(
+            (
+                f"encoder.encoder.{i}.layers.0.self_attn.out_proj.bias",
+                f"model.encoder.encoder.{i}.layers.0.self_attn.out_proj.bias",
+            )
+        )
+        rename_keys.append(
+            (
+                f"encoder.encoder.{i}.layers.0.linear1.weight",
+                f"model.encoder.encoder.{i}.layers.0.fc1.weight",
+            )
+        )
+        rename_keys.append(
+            (
+                f"encoder.encoder.{i}.layers.0.linear1.bias",
+                f"model.encoder.encoder.{i}.layers.0.fc1.bias",
+            )
+        )
+        rename_keys.append(
+            (
+                f"encoder.encoder.{i}.layers.0.linear2.weight",
+                f"model.encoder.encoder.{i}.layers.0.fc2.weight",
+            )
+        )
+        rename_keys.append(
+            (
+                f"encoder.encoder.{i}.layers.0.linear2.bias",
+                f"model.encoder.encoder.{i}.layers.0.fc2.bias",
+            )
+        )
+        rename_keys.append(
+            (
+                f"encoder.encoder.{i}.layers.0.norm1.weight",
+                f"model.encoder.encoder.{i}.layers.0.self_attn_layer_norm.weight",
+            )
+        )
+        rename_keys.append(
+            (
+                f"encoder.encoder.{i}.layers.0.norm1.bias",
+                f"model.encoder.encoder.{i}.layers.0.self_attn_layer_norm.bias",
+            )
+        )
+        rename_keys.append(
+            (
+                f"encoder.encoder.{i}.layers.0.norm2.weight",
+                f"model.encoder.encoder.{i}.layers.0.final_layer_norm.weight",
+            )
+        )
+        rename_keys.append(
+            (
+                f"encoder.encoder.{i}.layers.0.norm2.bias",
+                f"model.encoder.encoder.{i}.layers.0.final_layer_norm.bias",
+            )
+        )
+
+    for j in range(0, 3):
+        rename_keys.append((f"encoder.input_proj.{j}.conv.weight", f"model.encoder_input_proj.{j}.0.weight"))
+        for last in last_key:
+            rename_keys.append((f"encoder.input_proj.{j}.norm.{last}", f"model.encoder_input_proj.{j}.1.{last}"))
+
+    block_levels = 3 if config.backbone_config.layer_type != "basic" else 4
+
+    for i in range(len(config.encoder_in_channels) - 1):
+        # encoder layers: hybridencoder parts
+        for j in range(1, block_levels):
+            rename_keys.append(
+                (f"encoder.fpn_blocks.{i}.conv{j}.conv.weight", f"model.encoder.fpn_blocks.{i}.conv{j}.conv.weight")
+            )
+            for last in last_key:
+                rename_keys.append(
+                    (
+                        f"encoder.fpn_blocks.{i}.conv{j}.norm.{last}",
+                        f"model.encoder.fpn_blocks.{i}.conv{j}.norm.{last}",
+                    )
+                )
+
+        rename_keys.append((f"encoder.lateral_convs.{i}.conv.weight", f"model.encoder.lateral_convs.{i}.conv.weight"))
+        for last in last_key:
+            rename_keys.append(
+                (f"encoder.lateral_convs.{i}.norm.{last}", f"model.encoder.lateral_convs.{i}.norm.{last}")
+            )
+
+        for j in range(3):
+            for k in range(1, 3):
+                rename_keys.append(
+                    (
+                        f"encoder.fpn_blocks.{i}.bottlenecks.{j}.conv{k}.conv.weight",
+                        f"model.encoder.fpn_blocks.{i}.bottlenecks.{j}.conv{k}.conv.weight",
+                    )
+                )
+                for last in last_key:
+                    rename_keys.append(
+                        (
+                            f"encoder.fpn_blocks.{i}.bottlenecks.{j}.conv{k}.norm.{last}",
+                            f"model.encoder.fpn_blocks.{i}.bottlenecks.{j}.conv{k}.norm.{last}",
+                        )
+                    )
+
+        for j in range(1, block_levels):
+            rename_keys.append(
+                (f"encoder.pan_blocks.{i}.conv{j}.conv.weight", f"model.encoder.pan_blocks.{i}.conv{j}.conv.weight")
+            )
+            for last in last_key:
+                rename_keys.append(
+                    (
+                        f"encoder.pan_blocks.{i}.conv{j}.norm.{last}",
+                        f"model.encoder.pan_blocks.{i}.conv{j}.norm.{last}",
+                    )
+                )
+
+        for j in range(3):
+            for k in range(1, 3):
+                rename_keys.append(
+                    (
+                        f"encoder.pan_blocks.{i}.bottlenecks.{j}.conv{k}.conv.weight",
+                        f"model.encoder.pan_blocks.{i}.bottlenecks.{j}.conv{k}.conv.weight",
+                    )
+                )
+                for last in last_key:
+                    rename_keys.append(
+                        (
+                            f"encoder.pan_blocks.{i}.bottlenecks.{j}.conv{k}.norm.{last}",
+                            f"model.encoder.pan_blocks.{i}.bottlenecks.{j}.conv{k}.norm.{last}",
+                        )
+                    )
+
+        rename_keys.append(
+            (f"encoder.downsample_convs.{i}.conv.weight", f"model.encoder.downsample_convs.{i}.conv.weight")
+        )
+        for last in last_key:
+            rename_keys.append(
+                (f"encoder.downsample_convs.{i}.norm.{last}", f"model.encoder.downsample_convs.{i}.norm.{last}")
+            )
+
+    for i in range(config.decoder_layers):
+        # decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms
+        rename_keys.append(
+            (
+                f"decoder.decoder.layers.{i}.self_attn.out_proj.weight",
+                f"model.decoder.layers.{i}.self_attn.out_proj.weight",
+            )
+        )
+        rename_keys.append(
+            (
+                f"decoder.decoder.layers.{i}.self_attn.out_proj.bias",
+                f"model.decoder.layers.{i}.self_attn.out_proj.bias",
+            )
+        )
+        rename_keys.append(
+            (
+                f"decoder.decoder.layers.{i}.cross_attn.sampling_offsets.weight",
+                f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.weight",
+            )
+        )
+        rename_keys.append(
+            (
+                f"decoder.decoder.layers.{i}.cross_attn.sampling_offsets.bias",
+                f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.bias",
+            )
+        )
+        rename_keys.append(
+            (
+                f"decoder.decoder.layers.{i}.cross_attn.attention_weights.weight",
+                f"model.decoder.layers.{i}.encoder_attn.attention_weights.weight",
+            )
+        )
+        rename_keys.append(
+            (
+                f"decoder.decoder.layers.{i}.cross_attn.attention_weights.bias",
+                f"model.decoder.layers.{i}.encoder_attn.attention_weights.bias",
+            )
+        )
+        rename_keys.append(
+            (
+                f"decoder.decoder.layers.{i}.cross_attn.value_proj.weight",
+                f"model.decoder.layers.{i}.encoder_attn.value_proj.weight",
+            )
+        )
+        rename_keys.append(
+            (
+                f"decoder.decoder.layers.{i}.cross_attn.value_proj.bias",
+                f"model.decoder.layers.{i}.encoder_attn.value_proj.bias",
+            )
+        )
+        rename_keys.append(
+            (
+                f"decoder.decoder.layers.{i}.cross_attn.output_proj.weight",
+                f"model.decoder.layers.{i}.encoder_attn.output_proj.weight",
+            )
+        )
+        rename_keys.append(
+            (
+                f"decoder.decoder.layers.{i}.cross_attn.output_proj.bias",
+                f"model.decoder.layers.{i}.encoder_attn.output_proj.bias",
+            )
+        )
+        rename_keys.append(
+            (f"decoder.decoder.layers.{i}.norm1.weight", f"model.decoder.layers.{i}.self_attn_layer_norm.weight")
+        )
+        rename_keys.append(
+            (f"decoder.decoder.layers.{i}.norm1.bias", f"model.decoder.layers.{i}.self_attn_layer_norm.bias")
+        )
+        rename_keys.append(
+            (f"decoder.decoder.layers.{i}.norm2.weight", f"model.decoder.layers.{i}.encoder_attn_layer_norm.weight")
+        )
+        rename_keys.append(
+            (f"decoder.decoder.layers.{i}.norm2.bias", f"model.decoder.layers.{i}.encoder_attn_layer_norm.bias")
+        )
+        rename_keys.append((f"decoder.decoder.layers.{i}.linear1.weight", f"model.decoder.layers.{i}.fc1.weight"))
+        rename_keys.append((f"decoder.decoder.layers.{i}.linear1.bias", f"model.decoder.layers.{i}.fc1.bias"))
+        rename_keys.append((f"decoder.decoder.layers.{i}.linear2.weight", f"model.decoder.layers.{i}.fc2.weight"))
+        rename_keys.append((f"decoder.decoder.layers.{i}.linear2.bias", f"model.decoder.layers.{i}.fc2.bias"))
+        rename_keys.append(
+            (f"decoder.decoder.layers.{i}.norm3.weight", f"model.decoder.layers.{i}.final_layer_norm.weight")
+        )
+        rename_keys.append(
+            (f"decoder.decoder.layers.{i}.norm3.bias", f"model.decoder.layers.{i}.final_layer_norm.bias")
+        )
+
+    for i in range(config.decoder_layers):
+        # decoder + class and bounding box heads
+        rename_keys.append(
+            (
+                f"decoder.dec_score_head.{i}.weight",
+                f"model.decoder.class_embed.{i}.weight",
+            )
+        )
+        rename_keys.append(
+            (
+                f"decoder.dec_score_head.{i}.bias",
+                f"model.decoder.class_embed.{i}.bias",
+            )
+        )
+        rename_keys.append(
+            (
+                f"decoder.dec_bbox_head.{i}.layers.0.weight",
+                f"model.decoder.bbox_embed.{i}.layers.0.weight",
+            )
+        )
+        rename_keys.append(
+            (
+                f"decoder.dec_bbox_head.{i}.layers.0.bias",
+                f"model.decoder.bbox_embed.{i}.layers.0.bias",
+            )
+        )
+        rename_keys.append(
+            (
+                f"decoder.dec_bbox_head.{i}.layers.1.weight",
+                f"model.decoder.bbox_embed.{i}.layers.1.weight",
+            )
+        )
+        rename_keys.append(
+            (
+                f"decoder.dec_bbox_head.{i}.layers.1.bias",
+                f"model.decoder.bbox_embed.{i}.layers.1.bias",
+            )
+        )
+        rename_keys.append(
+            (
+                f"decoder.dec_bbox_head.{i}.layers.2.weight",
+                f"model.decoder.bbox_embed.{i}.layers.2.weight",
+            )
+        )
+        rename_keys.append(
+            (
+                f"decoder.dec_bbox_head.{i}.layers.2.bias",
+                f"model.decoder.bbox_embed.{i}.layers.2.bias",
+            )
+        )
+
+    # decoder projection
+    for i in range(len(config.decoder_in_channels)):
+        rename_keys.append(
+            (
+                f"decoder.input_proj.{i}.conv.weight",
+                f"model.decoder_input_proj.{i}.0.weight",
+            )
+        )
+        for last in last_key:
+            rename_keys.append(
+                (
+                    f"decoder.input_proj.{i}.norm.{last}",
+                    f"model.decoder_input_proj.{i}.1.{last}",
+                )
+            )
+
+    # convolutional projection + query embeddings + layernorm of decoder + class and bounding box heads
+    rename_keys.extend(
+        [
+            ("decoder.denoising_class_embed.weight", "model.denoising_class_embed.weight"),
+            ("decoder.query_pos_head.layers.0.weight", "model.decoder.query_pos_head.layers.0.weight"),
+            ("decoder.query_pos_head.layers.0.bias", "model.decoder.query_pos_head.layers.0.bias"),
+            ("decoder.query_pos_head.layers.1.weight", "model.decoder.query_pos_head.layers.1.weight"),
+            ("decoder.query_pos_head.layers.1.bias", "model.decoder.query_pos_head.layers.1.bias"),
+            ("decoder.enc_output.proj.weight", "model.enc_output.0.weight"),
+            ("decoder.enc_output.proj.bias", "model.enc_output.0.bias"),
+            ("decoder.enc_output.norm.weight", "model.enc_output.1.weight"),
+            ("decoder.enc_output.norm.bias", "model.enc_output.1.bias"),
+            ("decoder.enc_score_head.weight", "model.enc_score_head.weight"),
+            ("decoder.enc_score_head.bias", "model.enc_score_head.bias"),
+            ("decoder.enc_bbox_head.layers.0.weight", "model.enc_bbox_head.layers.0.weight"),
+            ("decoder.enc_bbox_head.layers.0.bias", "model.enc_bbox_head.layers.0.bias"),
+            ("decoder.enc_bbox_head.layers.1.weight", "model.enc_bbox_head.layers.1.weight"),
+            ("decoder.enc_bbox_head.layers.1.bias", "model.enc_bbox_head.layers.1.bias"),
+            ("decoder.enc_bbox_head.layers.2.weight", "model.enc_bbox_head.layers.2.weight"),
+            ("decoder.enc_bbox_head.layers.2.bias", "model.enc_bbox_head.layers.2.bias"),
+            ("decoder.decoder.layers.0.cross_attn.num_points_scale", "model.decoder.layers.0.cross_attn.num_points_scale"),
+            ("decoder.decoder.layers.1.cross_attn.num_points_scale", "model.decoder.layers.1.cross_attn.num_points_scale"),
+            ("decoder.decoder.layers.2.cross_attn.num_points_scale", "model.decoder.layers.2.cross_attn.num_points_scale"),
+            ("decoder.valid_mask", "model.decoder.valid_mask"),
+            ("decoder.anchors", "model.decoder.anchors"),
+        ]
+    )
+
+    return rename_keys
+
+
+def rename_key(state_dict, old, new):
+    try:
+        val = state_dict.pop(old)
+        state_dict[new] = val
+    except Exception:
+        pass
+
+
+def read_in_q_k_v(state_dict, config):
+    prefix = ""
+    encoder_hidden_dim = config.encoder_hidden_dim
+
+    # first: transformer encoder
+    for i in range(config.encoder_layers):
+        # read in weights + bias of input projection layer (in PyTorch's MultiHeadAttention, this is a single matrix + bias)
+        in_proj_weight = state_dict.pop(f"{prefix}encoder.encoder.{i}.layers.0.self_attn.in_proj_weight")
+        in_proj_bias = state_dict.pop(f"{prefix}encoder.encoder.{i}.layers.0.self_attn.in_proj_bias")
+        # next, add query, keys and values (in that order) to the state dict
+        state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.q_proj.weight"] = in_proj_weight[
+            :encoder_hidden_dim, :
+        ]
+        state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.q_proj.bias"] = in_proj_bias[:encoder_hidden_dim]
+        state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.k_proj.weight"] = in_proj_weight[
+            encoder_hidden_dim : 2 * encoder_hidden_dim, :
+        ]
+        state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.k_proj.bias"] = in_proj_bias[
+            encoder_hidden_dim : 2 * encoder_hidden_dim
+        ]
+        state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.v_proj.weight"] = in_proj_weight[
+            -encoder_hidden_dim:, :
+        ]
+        state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.v_proj.bias"] = in_proj_bias[-encoder_hidden_dim:]
+    # next: transformer decoder (which is a bit more complex because it also includes cross-attention)
+    for i in range(config.decoder_layers):
+        # read in weights + bias of input projection layer of self-attention
+        in_proj_weight = state_dict.pop(f"{prefix}decoder.decoder.layers.{i}.self_attn.in_proj_weight")
+        in_proj_bias = state_dict.pop(f"{prefix}decoder.decoder.layers.{i}.self_attn.in_proj_bias")
+        # next, add query, keys and values (in that order) to the state dict
+        state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :]
+        state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256]
+        state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :]
+        state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512]
+        state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :]
+        state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:]
+
+
+# We will verify our results on an image of cute cats
+def prepare_img():
+    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+    im = Image.open(requests.get(url, stream=True).raw)
+
+    return im
+
+
+@torch.no_grad()
+def convert_rt_detr_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, repo_id):
+    """
+    Copy/paste/tweak model's weights to our RTDETR structure.
+    """
+
+    # load default config
+    config = get_rt_detr_config(model_name)
+
+    # load original model from torch hub
+    model_name_to_checkpoint_url = {
+        "rtdetr_r18vd": "https://github.com/lyuwenyu/storage/releases/download/v0.2/rtdetrv2_r18vd_120e_coco_rerun_48.1.pth"
+    }
+    logger.info(f"Converting model {model_name}...")
+    state_dict = torch.hub.load_state_dict_from_url(model_name_to_checkpoint_url[model_name], map_location="cpu")[
+        "ema"
+    ]["module"]
+
+    # rename keys
+    for src, dest in create_rename_keys(config):
+        rename_key(state_dict, src, dest)
+    # query, key and value matrices need special treatment
+    read_in_q_k_v(state_dict, config)
+    # clean up the state dict: drop batchnorm tracking stats and duplicate the two-stage head weights
+    for key in state_dict.copy().keys():
+        if key.endswith("num_batches_tracked"):
+            del state_dict[key]
+        # for two_stage
+        if "bbox_embed" in key or ("class_embed" in key and "denoising_" not in key):
+            state_dict[key.split("model.decoder.")[-1]] = state_dict[key]
+
+    print("Done renaming, now loading the state dict")
+    # finally, create HuggingFace model and load state dict
+    model = RTDetrForObjectDetection(config)
+    model.load_state_dict(state_dict, strict=False)
+    model.eval()
+
+    # load image processor
+    image_processor = RTDetrImageProcessor()
+
+    # prepare image
+    img = prepare_img()
+
+    # preprocess image
+    transformations = transforms.Compose(
+        [
+            transforms.Resize([640, 640], interpolation=transforms.InterpolationMode.BILINEAR),
+            transforms.ToTensor(),
+        ]
+    )
+    original_pixel_values = transformations(img).unsqueeze(0)  # insert batch dimension
+
+    encoding = image_processor(images=img, return_tensors="pt")
+    pixel_values = encoding["pixel_values"]
+
+    assert torch.allclose(original_pixel_values, pixel_values)
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model.to(device)
+    pixel_values = pixel_values.to(device)
+
+    # Pass the image through the model
+    outputs = model(pixel_values)
+
+    # if model_name == "rtdetr_r18vd":
+    #     expected_slice_logits = torch.tensor(
+    #         [
+    #             [-4.3364253, -6.465683, -3.6130402],
+    #             [-4.083815, -6.4039373, -6.97881],
+    #             [-4.192215, -7.3410473, -6.9027247],
+    #         ]
+    #     )
+    #     expected_slice_boxes = torch.tensor(
+    #         [
+    #             [0.16868353, 0.19833282, 0.21182671],
+    #             [0.25559652, 0.55121744, 0.47988364],
+    #             [0.7698693, 0.4124569, 0.46036878],
+    #         ]
+    #     )
+    # elif model_name == "rtdetr_r34vd":
+    #     expected_slice_logits = torch.tensor(
+    #         [
+    #             [-4.3727384, -4.7921476, -5.7299604],
+    #             [-4.840536, -8.455345, -4.1745796],
+    #             [-4.1277084, -5.2154565, -5.7852697],
+    #         ]
+    #     )
+    #     expected_slice_boxes = torch.tensor(
+    #         [
+    #             [0.258278, 0.5497808, 0.4732004],
+    #             [0.16889669, 0.19890057, 0.21138911],
+    #             [0.76632994, 0.4147879, 0.46851268],
+    #         ]
+    #     )
+    # elif model_name == "rtdetr_r50vd_m":
+    #     expected_slice_logits = torch.tensor(
+    #         [
+    #             [-4.319764, -6.1349025, -6.094794],
+    #             [-5.1056995, -7.744766, -4.803956],
+    #             [-4.7685347, -7.9278393, -4.5751696],
+    #         ]
+    #     )
+    #     expected_slice_boxes = torch.tensor(
+    #         [
+    #             [0.2582739, 0.55071366, 0.47660282],
+    #             [0.16811174, 0.19954777, 0.21292639],
+    #             [0.54986024, 0.2752091, 0.0561416],
+    #         ]
+    #     )
+    # elif model_name == "rtdetr_r50vd":
+    #     expected_slice_logits = torch.tensor(
+    #         [
+    #             [-4.6476398, -5.001154, -4.9785104],
+    #             [-4.1593494, -4.7038546, -5.946485],
+    #             [-4.4374595, -4.658361, -6.2352347],
+    #         ]
+    #     )
+    #     expected_slice_boxes = torch.tensor(
+    #         [
+    #             [0.16880608, 0.19992264, 0.21225442],
+    #             [0.76837635, 0.4122631, 0.46368608],
+    #             [0.2595386, 0.5483334, 0.4777486],
+    #         ]
+    #     )
+    # elif model_name == "rtdetr_r101vd":
+    #     expected_slice_logits = torch.tensor(
+    #         [
+    #             [-4.6162, -4.9189, -4.6656],
+    #             [-4.4701, -4.4997, -4.9659],
+    #             [-5.6641, -7.9000, -5.0725],
+    #         ]
+    #     )
+    #     expected_slice_boxes = torch.tensor(
+    #         [
+    #             [0.7707, 0.4124, 0.4585],
+    #             [0.2589, 0.5492, 0.4735],
+    #             [0.1688, 0.1993, 0.2108],
+    #         ]
+    #     )
+    # elif model_name == "rtdetr_r18vd_coco_o365":
+    #     expected_slice_logits = torch.tensor(
+    #         [
+    #             [-4.8726, -5.9066, -5.2450],
+    #             [-4.8157, -6.8764, -5.1656],
+    #             [-4.7492, -5.7006, -5.1333],
+    #         ]
+    #     )
+    #     expected_slice_boxes = torch.tensor(
+    #         [
+    #             [0.2552, 0.5501, 0.4773],
+    #             [0.1685, 0.1986, 0.2104],
+    #             [0.7692, 0.4141, 0.4620],
+    #         ]
+    #     )
+    # elif model_name == "rtdetr_r50vd_coco_o365":
+    #     expected_slice_logits = torch.tensor(
+    #         [
+    #             [-4.6491, -3.9252, -5.3163],
+    #             [-4.1386, -5.0348, -3.9016],
+    #             [-4.4778, -4.5423, -5.7356],
+    #         ]
+    #     )
+    #     expected_slice_boxes = torch.tensor(
+    #         [
+    #             [0.2583, 0.5492, 0.4747],
+    #             [0.5501, 0.2754, 0.0574],
+    #             [0.7693, 0.4137, 0.4613],
+    #         ]
+    #     )
+    # elif model_name == "rtdetr_r101vd_coco_o365":
+    #     expected_slice_logits = torch.tensor(
+    #         [
+    #             [-4.5152, -5.6811, -5.7311],
+    #             [-4.5358, -7.2422, -5.0941],
+    #             [-4.6919, -5.5834, -6.0145],
+    #         ]
+    #     )
+    #     expected_slice_boxes = torch.tensor(
+    #         [
+    #             [0.7703, 0.4140, 0.4583],
+    #             [0.1686, 0.1991, 0.2107],
+    #             [0.2570, 0.5496, 0.4750],
+    #         ]
+    #     )
+    # else:
+    #     raise ValueError(f"Unknown rt_detr_name: {model_name}")
+
+    # assert torch.allclose(outputs.logits[0, :3, :3], expected_slice_logits.to(outputs.logits.device), atol=1e-4)
+    # assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes.to(outputs.pred_boxes.device), atol=1e-3)
+
+    if pytorch_dump_folder_path is not None:
+        Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
+        print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
+        model.save_pretrained(pytorch_dump_folder_path)
+        print(f"Saving image processor to {pytorch_dump_folder_path}")
+        image_processor.save_pretrained(pytorch_dump_folder_path)
+
+    if push_to_hub:
+        # Upload model, image processor and config to the hub
+        logger.info("Uploading PyTorch model and image processor to the hub...")
+        config.push_to_hub(
+            repo_id=repo_id, commit_message="Add config from convert_rt_detr_original_pytorch_checkpoint_to_pytorch.py"
+        )
+        model.push_to_hub(
+            repo_id=repo_id, commit_message="Add model from convert_rt_detr_original_pytorch_checkpoint_to_pytorch.py"
+        )
+        image_processor.push_to_hub(
+            repo_id=repo_id,
+            commit_message="Add image processor from convert_rt_detr_original_pytorch_checkpoint_to_pytorch.py",
+        )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model_name",
+        default="rtdetr_r18vd",
+        type=str,
+        help="model_name of the checkpoint you'd like to convert.",
+    )
+    parser.add_argument(
+        "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
+    )
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether to push the model to the hub or not.")
+    parser.add_argument(
+        "--repo_id",
+        type=str,
+        help="repo_id where the model will be pushed to.",
+    )
+    args = parser.parse_args()
+    convert_rt_detr_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.repo_id)
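
Once the script has been run and a checkpoint folder has been written, the converted weights can be loaded back through the regular transformers API. The snippet below is a minimal usage sketch, not part of the commit: the directory ./rtdetr_r18vd_converted is a hypothetical stand-in for whatever path was passed to --pytorch_dump_folder_path, and the test image is the same COCO picture used by prepare_img() above.

# Minimal sketch: load the converted checkpoint and run one detection pass.
# "./rtdetr_r18vd_converted" is a hypothetical output directory from --pytorch_dump_folder_path.
import requests
import torch
from PIL import Image

from transformers import RTDetrForObjectDetection, RTDetrImageProcessor

image = Image.open(
    requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw
)

processor = RTDetrImageProcessor.from_pretrained("./rtdetr_r18vd_converted")
model = RTDetrForObjectDetection.from_pretrained("./rtdetr_r18vd_converted").eval()

with torch.no_grad():
    outputs = model(**processor(images=image, return_tensors="pt"))

# Turn raw logits/boxes into COCO-style detections above a score threshold
results = processor.post_process_object_detection(
    outputs, target_sizes=torch.tensor([image.size[::-1]]), threshold=0.5
)[0]
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    print(f"{model.config.id2label[label.item()]}: {score.item():.2f} at {[round(v, 1) for v in box.tolist()]}")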