Add TransformerLayer, TransformerBlock, C3TR modules (#2333)
* yolotr
* transformer block
* Remove bias in Transformer
* Remove C3T
* Remove a deprecated class
* put the 2nd LayerNorm into the 2nd residual block
* move example model to models/hub, rename to -transformer
* Add module comments and TODOs
* Remove LN in Transformer
* Add comments for Transformer
* Solve the problem of MA with DDP
* cleanup
* cleanup find_unused_parameters
* PEP8 reformat
Co-authored-by: DingYiwei <[email protected]>
Co-authored-by: Glenn Jocher <[email protected]>
- models/common.py +54 -0
- models/hub/yolov5s-transformer.yaml +48 -0
- models/yolo.py +2 -2
- train.py +3 -1
models/common.py
CHANGED
@@ -43,6 +43,52 @@ class Conv(nn.Module):
         return self.act(self.conv(x))
 
 
+class TransformerLayer(nn.Module):
+    # Transformer layer https://arxiv.org/abs/2010.11929 (LayerNorm layers removed for better performance)
+    def __init__(self, c, num_heads):
+        super().__init__()
+        self.q = nn.Linear(c, c, bias=False)
+        self.k = nn.Linear(c, c, bias=False)
+        self.v = nn.Linear(c, c, bias=False)
+        self.ma = nn.MultiheadAttention(embed_dim=c, num_heads=num_heads)
+        self.fc1 = nn.Linear(c, c, bias=False)
+        self.fc2 = nn.Linear(c, c, bias=False)
+
+    def forward(self, x):
+        x = self.ma(self.q(x), self.k(x), self.v(x))[0] + x
+        x = self.fc2(self.fc1(x)) + x
+        return x
+
+
+class TransformerBlock(nn.Module):
+    # Vision Transformer https://arxiv.org/abs/2010.11929
+    def __init__(self, c1, c2, num_heads, num_layers):
+        super().__init__()
+        self.conv = None
+        if c1 != c2:
+            self.conv = Conv(c1, c2)
+        self.linear = nn.Linear(c2, c2)  # learnable position embedding
+        self.tr = nn.Sequential(*[TransformerLayer(c2, num_heads) for _ in range(num_layers)])
+        self.c2 = c2
+
+    def forward(self, x):
+        if self.conv is not None:
+            x = self.conv(x)
+        b, _, w, h = x.shape
+        p = x.flatten(2)
+        p = p.unsqueeze(0)
+        p = p.transpose(0, 3)
+        p = p.squeeze(3)
+        e = self.linear(p)
+        x = p + e
+
+        x = self.tr(x)
+        x = x.unsqueeze(3)
+        x = x.transpose(0, 3)
+        x = x.reshape(b, self.c2, w, h)
+        return x
+
+
 class Bottleneck(nn.Module):
     # Standard bottleneck
     def __init__(self, c1, c2, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, shortcut, groups, expansion
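A quick shape sanity check of the new block (a standalone sketch, not part of the PR; it assumes the repo root is on sys.path so models.common is importable, and the 2x256x20x20 input is an arbitrary choice):

import torch
from models.common import TransformerBlock

m = TransformerBlock(c1=256, c2=256, num_heads=4, num_layers=1)
x = torch.randn(2, 256, 20, 20)  # BCHW feature map
y = m(x)
assert y.shape == x.shape  # shape-preserving when c1 == c2 (no leading Conv is inserted)
# Internally the map is flattened to (h*w, batch, c) for nn.MultiheadAttention,
# the learned position embedding from nn.Linear is added, num_layers TransformerLayers
# run in sequence, and the result is reshaped back to BCHW.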
@@ -90,6 +136,14 @@ class C3(nn.Module):
         return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), dim=1))
 
 
+class C3TR(C3):
+    # C3 module with TransformerBlock()
+    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
+        super().__init__(c1, c2, n, shortcut, g, e)
+        c_ = int(c2 * e)
+        self.m = TransformerBlock(c_, c_, 4, n)
+
+
 class SPP(nn.Module):
     # Spatial pyramid pooling layer used in YOLOv3-SPP
     def __init__(self, c1, c2, k=(5, 9, 13)):
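C3TR keeps the C3 cv1/cv2/cv3 convolutions and only swaps the bottleneck stack self.m for a TransformerBlock over the hidden c_ = int(c2 * e) channels, with 4 attention heads and n transformer layers. A hedged instantiation check (standalone sketch; the 512-channel, 20x20 sizes are chosen to roughly match what the P5 stage of the yaml below produces at width 0.50):

import torch
from models.common import C3TR

m = C3TR(c1=512, c2=512, n=1, shortcut=False)
y = m(torch.randn(1, 512, 20, 20))
print(y.shape)  # torch.Size([1, 512, 20, 20]); attention runs on c_ = int(512 * 0.5) = 256 channels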
models/hub/yolov5s-transformer.yaml
ADDED
@@ -0,0 +1,48 @@
+# parameters
+nc: 80  # number of classes
+depth_multiple: 0.33  # model depth multiple
+width_multiple: 0.50  # layer channel multiple
+
+# anchors
+anchors:
+  - [10,13, 16,30, 33,23]  # P3/8
+  - [30,61, 62,45, 59,119]  # P4/16
+  - [116,90, 156,198, 373,326]  # P5/32
+
+# YOLOv5 backbone
+backbone:
+  # [from, number, module, args]
+  [[-1, 1, Focus, [64, 3]],  # 0-P1/2
+   [-1, 1, Conv, [128, 3, 2]],  # 1-P2/4
+   [-1, 3, C3, [128]],
+   [-1, 1, Conv, [256, 3, 2]],  # 3-P3/8
+   [-1, 9, C3, [256]],
+   [-1, 1, Conv, [512, 3, 2]],  # 5-P4/16
+   [-1, 9, C3, [512]],
+   [-1, 1, Conv, [1024, 3, 2]],  # 7-P5/32
+   [-1, 1, SPP, [1024, [5, 9, 13]]],
+   [-1, 3, C3TR, [1024, False]],  # 9 <-------- C3TR() Transformer module
+  ]
+
+# YOLOv5 head
+head:
+  [[-1, 1, Conv, [512, 1, 1]],
+   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
+   [[-1, 6], 1, Concat, [1]],  # cat backbone P4
+   [-1, 3, C3, [512, False]],  # 13
+
+   [-1, 1, Conv, [256, 1, 1]],
+   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
+   [[-1, 4], 1, Concat, [1]],  # cat backbone P3
+   [-1, 3, C3, [256, False]],  # 17 (P3/8-small)
+
+   [-1, 1, Conv, [256, 3, 2]],
+   [[-1, 14], 1, Concat, [1]],  # cat head P4
+   [-1, 3, C3, [512, False]],  # 20 (P4/16-medium)
+
+   [-1, 1, Conv, [512, 3, 2]],
+   [[-1, 10], 1, Concat, [1]],  # cat head P5
+   [-1, 3, C3, [1024, False]],  # 23 (P5/32-large)
+
+   [[17, 20, 23], 1, Detect, [nc, anchors]],  # Detect(P3, P4, P5)
+  ]
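To exercise the new config, one option is a dry build through the existing Model class in models/yolo.py (a sketch, not part of the PR; it assumes the repo root as working directory and the Model(cfg, ch, nc) signature already in the codebase):

import torch
from models.yolo import Model

model = Model('models/hub/yolov5s-transformer.yaml', ch=3, nc=80)  # parse_model prints the layer table, incl. C3TR at index 9
model.eval()
with torch.no_grad():
    out = model(torch.zeros(1, 3, 640, 640))  # smoke-test forward pass

Training then follows the usual train.py flow with --cfg pointed at this yaml.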
models/yolo.py
CHANGED
@@ -215,13 +215,13 @@ def parse_model(d, ch): # model_dict, input_channels(3)
 
         n = max(round(n * gd), 1) if n > 1 else n  # depth gain
         if m in [Conv, GhostConv, Bottleneck, GhostBottleneck, SPP, DWConv, MixConv2d, Focus, CrossConv, BottleneckCSP,
-                 C3]:
+                 C3, C3TR]:
             c1, c2 = ch[f], args[0]
             if c2 != no:  # if not output
                 c2 = make_divisible(c2 * gw, 8)
 
             args = [c1, c2, *args[1:]]
-            if m in [BottleneckCSP, C3]:
+            if m in [BottleneckCSP, C3, C3TR]:
                 args.insert(2, n)  # number of repeats
                 n = 1
         elif m is nn.BatchNorm2d:
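To make the parse_model() change concrete, here is the backbone row [-1, 3, C3TR, [1024, False]] expanded at yolov5s scale (gd=0.33, gw=0.50); a worked illustration mirroring the code above, not an excerpt from the repo:

import math

gd, gw = 0.33, 0.50                 # depth_multiple, width_multiple from the yaml
n = max(round(3 * gd), 1)           # 3 repeats scaled by depth -> 1
c1 = 512                            # ch[f]: SPP output channels after width scaling
c2 = math.ceil(1024 * gw / 8) * 8   # make_divisible(1024 * 0.50, 8) -> 512
args = [c1, c2, False]              # args = [c1, c2, *args[1:]]
args.insert(2, n)                   # C3TR is in the repeats list -> [512, 512, 1, False]
# The layer is therefore built as C3TR(512, 512, 1, False): one TransformerLayer
# with 4 heads over int(512 * 0.5) = 256 hidden channels.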
train.py
CHANGED
@@ -218,7 +218,9 @@ def train(hyp, opt, device, tb_writer=None):
 
     # DDP mode
     if cuda and rank != -1:
-        model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank)
+        model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank,
+                    # nn.MultiheadAttention incompatibility with DDP https://github.com/pytorch/pytorch/issues/26698
+                    find_unused_parameters=any(isinstance(layer, nn.MultiheadAttention) for layer in model.modules()))
 
     # Model parameters
     hyp['box'] *= 3. / nl  # scale to layers
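nn.MultiheadAttention is known to trip DDP (see the linked PyTorch issue), so the change enables find_unused_parameters only when such a layer is actually present, leaving plain CNN models on the cheaper default. A standalone illustration of the detection logic (not repo code):

import torch.nn as nn

model = nn.Sequential(nn.Linear(32, 32), nn.MultiheadAttention(embed_dim=32, num_heads=4))
needs_flag = any(isinstance(layer, nn.MultiheadAttention) for layer in model.modules())
print(needs_flag)  # True -> DDP is constructed with find_unused_parameters=True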