henry000 committed on
Commit
d1aee89
·
1 Parent(s): 78e3679

✨ [New] v9-s and v9-m models! New model architectures & weights

Browse files
yolo/config/model/v9-m.yaml ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ anchor:
2
+ reg_max: 16
3
+
4
+ model:
5
+ backbone:
6
+ - Conv:
7
+ args: {out_channels: 32, kernel_size: 3, stride: 2}
8
+ source: 0
9
+ - Conv:
10
+ args: {out_channels: 64, kernel_size: 3, stride: 2}
11
+ - RepNCSPELAN:
12
+ args: {out_channels: 128, part_channels: 128}
13
+
14
+ - AConv:
15
+ args: {out_channels: 240}
16
+ - RepNCSPELAN:
17
+ args: {out_channels: 240, part_channels: 240}
18
+ tags: B3
19
+
20
+ - AConv:
21
+ args: {out_channels: 360}
22
+ - RepNCSPELAN:
23
+ args: {out_channels: 360, part_channels: 360}
24
+ tags: B4
25
+
26
+ - AConv:
27
+ args: {out_channels: 480}
28
+ - RepNCSPELAN:
29
+ args: {out_channels: 480, part_channels: 480}
30
+ tags: B5
31
+
32
+ neck:
33
+ - SPPELAN:
34
+ args: {out_channels: 480}
35
+ tags: N3
36
+
37
+ - UpSample:
38
+ args: {scale_factor: 2, mode: nearest}
39
+ - Concat:
40
+ source: [-1, B4]
41
+ - RepNCSPELAN:
42
+ args: {out_channels: 360, part_channels: 360}
43
+ tags: N4
44
+
45
+ - UpSample:
46
+ args: {scale_factor: 2, mode: nearest}
47
+ - Concat:
48
+ source: [-1, B3]
49
+
50
+ head:
51
+ - RepNCSPELAN:
52
+ args: {out_channels: 240, part_channels: 240}
53
+ tags: P3
54
+
55
+ - AConv:
56
+ args: {out_channels: 184}
57
+ - Concat:
58
+ source: [-1, N4]
59
+ - RepNCSPELAN:
60
+ args: {out_channels: 360, part_channels: 360}
61
+ tags: P4
62
+
63
+ - AConv:
64
+ args: {out_channels: 240}
65
+ - Concat:
66
+ source: [-1, N3]
67
+ - RepNCSPELAN:
68
+ args: {out_channels: 480, part_channels: 480}
69
+ tags: P5
70
+
71
+ detection:
72
+ - MultiheadDetection:
73
+ source: [P3, P4, P5]
74
+ tags: Main
75
+ args:
76
+ reg_max: ${model.anchor.reg_max}
77
+ output: True
78
+
79
+ auxiliary:
80
+ - CBLinear:
81
+ source: B3
82
+ args: {out_channels: [240]}
83
+ tags: R3
84
+ - CBLinear:
85
+ source: B4
86
+ args: {out_channels: [240, 360]}
87
+ tags: R4
88
+ - CBLinear:
89
+ source: B5
90
+ args: {out_channels: [240, 360, 480]}
91
+ tags: R5
92
+
93
+ - Conv:
94
+ args: {out_channels: 32, kernel_size: 3, stride: 2}
95
+ source: 0
96
+ - Conv:
97
+ args: {out_channels: 64, kernel_size: 3, stride: 2}
98
+ - RepNCSPELAN:
99
+ args: {out_channels: 128, part_channels: 128}
100
+
101
+ - AConv:
102
+ args: {out_channels: 240}
103
+ - CBFuse:
104
+ source: [R3, R4, R5, -1]
105
+ args: {index: [0, 0, 0]}
106
+ - RepNCSPELAN:
107
+ args: {out_channels: 240, part_channels: 240}
108
+ tags: A3
109
+
110
+ - AConv:
111
+ args: {out_channels: 360}
112
+ - CBFuse:
113
+ source: [R4, R5, -1]
114
+ args: {index: [1, 1]}
115
+ - RepNCSPELAN:
116
+ args: {out_channels: 360, part_channels: 360}
117
+ tags: A4
118
+
119
+ - AConv:
120
+ args: {out_channels: 480}
121
+ - CBFuse:
122
+ source: [R5, -1]
123
+ args: {index: [2]}
124
+ - RepNCSPELAN:
125
+ args: {out_channels: 480, part_channels: 480}
126
+ tags: A5
127
+
128
+ - MultiheadDetection:
129
+ source: [A3, A4, A5]
130
+ tags: AUX
131
+ args:
132
+ reg_max: ${model.anchor.reg_max}
133
+ output: True
yolo/config/model/v9-s.yaml ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ anchor:
2
+ reg_max: 16
3
+
4
+ model:
5
+ backbone:
6
+ - Conv:
7
+ args: {out_channels: 32, kernel_size: 3, stride: 2}
8
+ source: 0
9
+ - Conv:
10
+ args: {out_channels: 64, kernel_size: 3, stride: 2}
11
+ - ELAN:
12
+ args: {out_channels: 64, part_channels: 64}
13
+
14
+ - AConv:
15
+ args: {out_channels: 128}
16
+ - RepNCSPELAN:
17
+ args:
18
+ out_channels: 128
19
+ part_channels: 128
20
+ csp_args: {repeat_num: 3}
21
+ tags: B3 # 18
22
+
23
+ - AConv:
24
+ args: {out_channels: 192}
25
+ - RepNCSPELAN:
26
+ args:
27
+ out_channels: 192
28
+ part_channels: 192
29
+ csp_args: {repeat_num: 3}
30
+ tags: B4
31
+
32
+ - AConv:
33
+ args: {out_channels: 256}
34
+ - RepNCSPELAN:
35
+ args:
36
+ out_channels: 256
37
+ part_channels: 256
38
+ csp_args: {repeat_num: 3}
39
+ tags: B5
40
+
41
+ neck:
42
+ - SPPELAN:
43
+ args: {out_channels: 256}
44
+ tags: N3
45
+
46
+ - UpSample:
47
+ args: {scale_factor: 2, mode: nearest}
48
+ - Concat:
49
+ source: [-1, B4]
50
+ - RepNCSPELAN:
51
+ args:
52
+ out_channels: 192
53
+ part_channels: 192
54
+ csp_args: {repeat_num: 3}
55
+ tags: N4
56
+
57
+ - UpSample:
58
+ args: {scale_factor: 2, mode: nearest}
59
+ - Concat:
60
+ source: [-1, B3]
61
+
62
+ - RepNCSPELAN:
63
+ args:
64
+ out_channels: 128
65
+ part_channels: 128
66
+ csp_args: {repeat_num: 3}
67
+ tags: P3
68
+ - AConv:
69
+ args: {out_channels: 96}
70
+ - Concat:
71
+ source: [-1, N4]
72
+
73
+ - RepNCSPELAN:
74
+ args:
75
+ out_channels: 192
76
+ part_channels: 192
77
+ csp_args: {repeat_num: 3}
78
+ tags: P4
79
+ - AConv:
80
+ args: {out_channels: 128}
81
+ - Concat:
82
+ source: [-1, N3]
83
+
84
+ - RepNCSPELAN:
85
+ args:
86
+ out_channels: 256
87
+ part_channels: 256
88
+ csp_args: {repeat_num: 3}
89
+ tags: P5
90
+
91
+ detection:
92
+ - MultiheadDetection:
93
+ source: [P3, P4, P5]
94
+ tags: Main
95
+ args:
96
+ reg_max: ${model.anchor.reg_max}
97
+ output: True
98
+
99
+ head:
100
+ - SPPELAN:
101
+ source: B5
102
+ args: {out_channels: 256}
103
+ tags: A5
104
+
105
+ - UpSample:
106
+ args: {scale_factor: 2, mode: nearest}
107
+ - Concat:
108
+ source: [-1, B4]
109
+
110
+ - RepNCSPELAN:
111
+ args:
112
+ out_channels: 192
113
+ part_channels: 192
114
+ csp_args: {repeat_num: 3}
115
+ tags: A4
116
+
117
+ - UpSample:
118
+ args: {scale_factor: 2, mode: nearest}
119
+ - Concat:
120
+ source: [-1, B3]
121
+
122
+ - RepNCSPELAN:
123
+ args:
124
+ out_channels: 128
125
+ part_channels: 128
126
+ csp_args: {repeat_num: 3}
127
+ tags: A3
128
+
129
+ - MultiheadDetection:
130
+ source: [A3, A4, A5]
131
+ tags: AUX
132
+ args:
133
+ reg_max: ${model.anchor.reg_max}
134
+ output: True
yolo/model/module.py CHANGED
@@ -192,6 +192,36 @@ class RepNCSP(nn.Module):
192
  return self.conv3(torch.cat((x1, x2), dim=1))
193
 
194
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
  class RepNCSPELAN(nn.Module):
196
  """RepNCSPELAN block combining RepNCSP blocks with ELAN structure."""
197
 
@@ -230,6 +260,21 @@ class RepNCSPELAN(nn.Module):
230
  return x5
231
 
232
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
  class ADown(nn.Module):
234
  """Downsampling module combining average and max pooling with convolution for feature reduction."""
235
 
@@ -498,26 +543,6 @@ class CSPDark(nn.Module):
498
  return self.cv2(torch.cat((self.cb(y[0]), y[1]), 1))
499
 
500
 
501
- # ELAN
502
- class ELAN(nn.Module):
503
- # ELAN
504
- def __init__(self, in_channels, out_channels, med_channels, elan_repeat=2, cb_repeat=2, ratio=1.0):
505
-
506
- super().__init__()
507
-
508
- h_channels = med_channels // 2
509
- self.cv1 = Conv(in_channels, med_channels, 1, 1)
510
- self.cb = nn.ModuleList(ConvBlock(h_channels, repeat=cb_repeat, ratio=ratio) for _ in range(elan_repeat))
511
- self.cv2 = Conv((2 + elan_repeat) * h_channels, out_channels, 1, 1)
512
-
513
- def forward(self, x):
514
-
515
- y = list(self.cv1(x).chunk(2, 1))
516
- y.extend((m(y[-1])) for m in self.cb)
517
-
518
- return self.cv2(torch.cat(y, 1))
519
-
520
-
521
  class CSPELAN(nn.Module):
522
  # ELAN
523
  def __init__(self, in_channels, out_channels, med_channels, elan_repeat=2, cb_repeat=2, ratio=1.0):
 
192
  return self.conv3(torch.cat((x1, x2), dim=1))
193
 
194
 
195
class ELAN(nn.Module):
    """ELAN block: split a projected feature map, refine one half twice, fuse everything.

    The input is projected by ``conv1``, split into two chunks along dim 1,
    and the second chunk is passed through ``conv2`` and then ``conv3``.
    Both chunks and both refined tensors are concatenated along dim 1 and
    fused by ``conv4``.

    Args:
        in_channels: Channel count handed to ``conv1`` as its input size.
        out_channels: Channel count handed to ``conv4`` as its output size.
        part_channels: Output size of ``conv1``; its result is chunked in two,
            so an even value is presumably expected (TODO confirm with callers).
        process_channels: Width of the two refinement convolutions
            (keyword-only). Defaults to ``part_channels // 2``.
        **kwargs: Forwarded unchanged to every ``Conv`` built here.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        part_channels: int,
        *,
        process_channels: Optional[int] = None,
        **kwargs,
    ):
        super().__init__()

        half_channels = part_channels // 2
        if process_channels is None:
            process_channels = half_channels

        # conv4 consumes the full conv1 output (both chunks) plus the two
        # refinement outputs, hence part_channels + 2 * process_channels.
        fused_channels = part_channels + 2 * process_channels

        self.conv1 = Conv(in_channels, part_channels, 1, **kwargs)
        self.conv2 = Conv(half_channels, process_channels, 3, padding=1, **kwargs)
        self.conv3 = Conv(process_channels, process_channels, 3, padding=1, **kwargs)
        self.conv4 = Conv(fused_channels, out_channels, 1, **kwargs)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the block on ``x`` and return the fused tensor produced by ``conv4``."""
        skip, trunk = self.conv1(x).chunk(2, 1)
        refined_once = self.conv2(trunk)
        refined_twice = self.conv3(refined_once)
        stacked = torch.cat([skip, trunk, refined_once, refined_twice], dim=1)
        return self.conv4(stacked)
223
+
224
+
225
  class RepNCSPELAN(nn.Module):
226
  """RepNCSPELAN block combining RepNCSP blocks with ELAN structure."""
227
 
 
260
  return x5
261
 
262
 
263
class AConv(nn.Module):
    """Downsampling module: an average pool followed by a strided convolution.

    Unlike ``ADown``, this block has no max-pooling branch; it applies a
    single average pool and then a single convolution.

    Args:
        in_channels: Channel count passed to the convolution as its input size.
        out_channels: Channel count passed to the convolution as its output size.
    """

    def __init__(self, in_channels: int, out_channels: int):
        super().__init__()
        # Pool and Conv are project helpers defined elsewhere in this module;
        # the pool is built with kernel_size=2/stride=1 and the convolution with
        # kernel_size=3/stride=2 (presumably the step that reduces resolution).
        self.avg_pool = Pool("avg", kernel_size=2, stride=1)
        self.conv = Conv(in_channels, out_channels, kernel_size=3, stride=2)

    def forward(self, x: Tensor) -> Tensor:
        """Apply the average pool, then the convolution, and return the result."""
        x = self.avg_pool(x)
        x = self.conv(x)
        return x
276
+
277
+
278
  class ADown(nn.Module):
279
  """Downsampling module combining average and max pooling with convolution for feature reduction."""
280
 
 
543
  return self.cv2(torch.cat((self.cb(y[0]), y[1]), 1))
544
 
545
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
546
  class CSPELAN(nn.Module):
547
  # ELAN
548
  def __init__(self, in_channels, out_channels, med_channels, elan_repeat=2, cb_repeat=2, ratio=1.0):