hujiecpp committed on
Commit 4d17f72 · 1 Parent(s): 7b1db44

init project

This view is limited to 50 files because it contains too many changes; see the raw diff for the full change set.
Files changed (50)
  1. modules/mobilesamv2/promt_mobilesamv2/model.py +5 -5
  2. modules/pe3r/models.py +3 -5
  3. modules/ultralytics/__init__.py +12 -0
  4. modules/ultralytics/__pycache__/__init__.cpython-312.pyc +0 -0
  5. modules/ultralytics/assets/bus.jpg +0 -0
  6. modules/ultralytics/assets/zidane.jpg +0 -0
  7. modules/ultralytics/hub/__init__.py +117 -0
  8. modules/ultralytics/hub/__pycache__/__init__.cpython-312.pyc +0 -0
  9. modules/ultralytics/hub/__pycache__/auth.cpython-312.pyc +0 -0
  10. modules/ultralytics/hub/__pycache__/utils.cpython-312.pyc +0 -0
  11. modules/ultralytics/hub/auth.py +139 -0
  12. modules/ultralytics/hub/session.py +189 -0
  13. modules/ultralytics/hub/utils.py +217 -0
  14. modules/ultralytics/models/README.md +45 -0
  15. modules/ultralytics/models/rt-detr/rtdetr-l.yaml +50 -0
  16. modules/ultralytics/models/rt-detr/rtdetr-x.yaml +54 -0
  17. modules/ultralytics/models/v3/yolov3-spp.yaml +48 -0
  18. modules/ultralytics/models/v3/yolov3-tiny.yaml +39 -0
  19. modules/ultralytics/models/v3/yolov3.yaml +48 -0
  20. modules/ultralytics/models/v5/yolov5-p6.yaml +61 -0
  21. modules/ultralytics/models/v5/yolov5.yaml +50 -0
  22. modules/ultralytics/models/v6/yolov6.yaml +53 -0
  23. modules/ultralytics/models/v8/yolov8-cls.yaml +29 -0
  24. modules/ultralytics/models/v8/yolov8-p2.yaml +54 -0
  25. modules/ultralytics/models/v8/yolov8-p6.yaml +56 -0
  26. modules/ultralytics/models/v8/yolov8-pose-p6.yaml +57 -0
  27. modules/ultralytics/models/v8/yolov8-pose.yaml +47 -0
  28. modules/ultralytics/models/v8/yolov8-rtdetr.yaml +46 -0
  29. modules/ultralytics/models/v8/yolov8-seg.yaml +46 -0
  30. modules/ultralytics/models/v8/yolov8.yaml +46 -0
  31. modules/ultralytics/nn/__init__.py +9 -0
  32. modules/ultralytics/nn/__pycache__/__init__.cpython-312.pyc +0 -0
  33. modules/ultralytics/nn/__pycache__/autobackend.cpython-312.pyc +0 -0
  34. modules/ultralytics/nn/__pycache__/tasks.cpython-312.pyc +0 -0
  35. modules/ultralytics/nn/autobackend.py +455 -0
  36. modules/ultralytics/nn/autoshape.py +244 -0
  37. modules/ultralytics/nn/modules/__init__.py +29 -0
  38. modules/ultralytics/nn/modules/__pycache__/__init__.cpython-312.pyc +0 -0
  39. modules/ultralytics/nn/modules/__pycache__/block.cpython-312.pyc +0 -0
  40. modules/ultralytics/nn/modules/__pycache__/conv.cpython-312.pyc +0 -0
  41. modules/ultralytics/nn/modules/__pycache__/head.cpython-312.pyc +0 -0
  42. modules/ultralytics/nn/modules/__pycache__/transformer.cpython-312.pyc +0 -0
  43. modules/ultralytics/nn/modules/__pycache__/utils.cpython-312.pyc +0 -0
  44. modules/ultralytics/nn/modules/block.py +304 -0
  45. modules/ultralytics/nn/modules/conv.py +297 -0
  46. modules/ultralytics/nn/modules/head.py +351 -0
  47. modules/ultralytics/nn/modules/transformer.py +378 -0
  48. modules/ultralytics/nn/modules/utils.py +78 -0
  49. modules/ultralytics/nn/tasks.py +780 -0
  50. modules/ultralytics/tracker/README.md +86 -0
modules/mobilesamv2/promt_mobilesamv2/model.py CHANGED
@@ -1,9 +1,9 @@
- from ultralytics.yolo.cfg import get_cfg
- from ultralytics.yolo.engine.exporter import Exporter
- from ultralytics.yolo.engine.model import YOLO
- from ultralytics.yolo.utils import DEFAULT_CFG, LOGGER, ROOT, is_git_dir
+ from modules.ultralytics.yolo.cfg import get_cfg
+ from modules.ultralytics.yolo.engine.exporter import Exporter
+ from modules.ultralytics.yolo.engine.model import YOLO
+ from modules.ultralytics.yolo.utils import DEFAULT_CFG, LOGGER, ROOT, is_git_dir

- from ultralytics.yolo.utils.torch_utils import model_info, smart_inference_mode
+ from modules.ultralytics.yolo.utils.torch_utils import model_info, smart_inference_mode
  from .predict import PromptModelPredictor
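The only substantive change here is the import prefix: the prompt-guided MobileSAMv2 wrapper now resolves the YOLO engine from the vendored copy under `modules/ultralytics` rather than a pip-installed `ultralytics`. A minimal sketch of what this implies for callers, assuming scripts are launched from the repository root (the `sys.path` handling below is illustrative, not part of the commit):

```python
# Minimal sketch, assuming the repository root is the working directory.
import os
import sys

REPO_ROOT = os.path.abspath('.')  # assumption: run from the repo root so 'modules' is importable
sys.path.insert(0, REPO_ROOT)

# Same vendored import path introduced by this diff (not the pip-installed package)
from modules.ultralytics.yolo.engine.model import YOLO

model = YOLO('yolov8n.yaml')  # hypothetical usage; any bundled config would do
```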
modules/pe3r/models.py CHANGED
@@ -1,6 +1,6 @@
  import os
  import sys
- # sys.path.append(os.path.abspath('./modules/ultralytics'))
+ sys.path.append(os.path.abspath('./modules/ultralytics'))

  from transformers import AutoTokenizer, AutoModel, AutoProcessor, SamModel
  from modules.mast3r.model import AsymmetricMASt3R
@@ -12,7 +12,6 @@ from modules.mobilesamv2 import sam_model_registry
  from sam2.sam2_video_predictor import SAM2VideoPredictor
  import spaces
  import torch
- from ultralytics import YOLOvv8

  class Models:
  @spaces.GPU
@@ -48,9 +47,8 @@ class Models:
  self.mobilesamv2.eval()

  # -- yolov8 --
- # YOLO8_CKP='./checkpoints/ObjectAwareModel.pt'
- # self.yolov8 = ObjectAwareModel(YOLO8_CKP)
- self.yolov8 = YOLOvv8.from_pretrained("Ultralytics/YOLOv8")
+ YOLO8_CKP='./checkpoints/ObjectAwareModel.pt'
+ self.yolov8 = ObjectAwareModel(YOLO8_CKP)

  # -- siglip --
  self.siglip = AutoModel.from_pretrained("google/siglip-large-patch16-256", device_map=device)
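In the same file, the yolov8 block now instantiates the object-aware detector from a local checkpoint instead of calling `YOLOvv8.from_pretrained(...)`, a name that is not exported by the vendored package's `__init__.py`. A hedged sketch of the new code path; the import location of `ObjectAwareModel` is an assumption, since it is not shown in this hunk:

```python
# Hedged sketch of the new yolov8 initialisation path.
from modules.mobilesamv2 import ObjectAwareModel  # assumption: the real import may live elsewhere

YOLO8_CKP = './checkpoints/ObjectAwareModel.pt'   # local checkpoint referenced by the diff
yolov8 = ObjectAwareModel(YOLO8_CKP)              # replaces YOLOvv8.from_pretrained("Ultralytics/YOLOv8")
```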
modules/ultralytics/__init__.py ADDED
@@ -0,0 +1,12 @@
+ # Ultralytics YOLO 🚀, AGPL-3.0 license
+
+ __version__ = '8.0.120'
+
+ from ultralytics.hub import start
+ from ultralytics.vit.rtdetr import RTDETR
+ from ultralytics.vit.sam import SAM
+ from ultralytics.yolo.engine.model import YOLO
+ from ultralytics.yolo.nas import NAS
+ from ultralytics.yolo.utils.checks import check_yolo as checks
+
+ __all__ = '__version__', 'YOLO', 'NAS', 'SAM', 'RTDETR', 'checks', 'start' # allow simpler import
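A hedged usage sketch of the public surface declared in `__all__` above; it assumes the directory containing the vendored `ultralytics` package (here `./modules`) has been put on `sys.path`:

```python
# Illustrative only; assumes './modules' (the parent of the vendored package) is importable.
import os
import sys

sys.path.insert(0, os.path.abspath('./modules'))

import ultralytics

print(ultralytics.__version__)            # '8.0.120' per the vendored __init__.py
ultralytics.checks()                      # alias of check_yolo: prints environment information
model = ultralytics.YOLO('yolov8n.yaml')  # build an untrained YOLOv8n from a bundled config
```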
modules/ultralytics/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (579 Bytes).
 
modules/ultralytics/assets/bus.jpg ADDED
modules/ultralytics/assets/zidane.jpg ADDED
modules/ultralytics/hub/__init__.py ADDED
@@ -0,0 +1,117 @@
1
+ # Ultralytics YOLO 🚀, AGPL-3.0 license
2
+
3
+ import requests
4
+
5
+ from ultralytics.hub.auth import Auth
6
+ from ultralytics.hub.utils import PREFIX
7
+ from ultralytics.yolo.data.utils import HUBDatasetStats
8
+ from ultralytics.yolo.utils import LOGGER, SETTINGS, USER_CONFIG_DIR, yaml_save
9
+
10
+
11
+ def login(api_key=''):
12
+ """
13
+ Log in to the Ultralytics HUB API using the provided API key.
14
+
15
+ Args:
16
+ api_key (str, optional): May be an API key or a combination API key and model ID, i.e. key_id
17
+
18
+ Example:
19
+ from ultralytics import hub
20
+ hub.login('API_KEY')
21
+ """
22
+ Auth(api_key, verbose=True)
23
+
24
+
25
+ def logout():
26
+ """
27
+ Log out of Ultralytics HUB by removing the API key from the settings file. To log in again, use 'yolo hub login'.
28
+
29
+ Example:
30
+ from ultralytics import hub
31
+ hub.logout()
32
+ """
33
+ SETTINGS['api_key'] = ''
34
+ yaml_save(USER_CONFIG_DIR / 'settings.yaml', SETTINGS)
35
+ LOGGER.info(f"{PREFIX}logged out ✅. To log in again, use 'yolo hub login'.")
36
+
37
+
38
+ def start(key=''):
39
+ """
40
+ Start training models with Ultralytics HUB (DEPRECATED).
41
+
42
+ Args:
43
+ key (str, optional): A string containing either the API key and model ID combination (apikey_modelid),
44
+ or the full model URL (https://hub.ultralytics.com/models/apikey_modelid).
45
+ """
46
+ api_key, model_id = key.split('_')
47
+ LOGGER.warning(f"""
48
+ WARNING ⚠️ ultralytics.start() is deprecated after 8.0.60. Updated usage to train Ultralytics HUB models is:
49
+
50
+ from ultralytics import YOLO, hub
51
+
52
+ hub.login('{api_key}')
53
+ model = YOLO('https://hub.ultralytics.com/models/{model_id}')
54
+ model.train()""")
55
+
56
+
57
+ def reset_model(model_id=''):
58
+ """Reset a trained model to an untrained state."""
59
+ r = requests.post('https://api.ultralytics.com/model-reset', json={'apiKey': Auth().api_key, 'modelId': model_id})
60
+ if r.status_code == 200:
61
+ LOGGER.info(f'{PREFIX}Model reset successfully')
62
+ return
63
+ LOGGER.warning(f'{PREFIX}Model reset failure {r.status_code} {r.reason}')
64
+
65
+
66
+ def export_fmts_hub():
67
+ """Returns a list of HUB-supported export formats."""
68
+ from ultralytics.yolo.engine.exporter import export_formats
69
+ return list(export_formats()['Argument'][1:]) + ['ultralytics_tflite', 'ultralytics_coreml']
70
+
71
+
72
+ def export_model(model_id='', format='torchscript'):
73
+ """Export a model to all formats."""
74
+ assert format in export_fmts_hub(), f"Unsupported export format '{format}', valid formats are {export_fmts_hub()}"
75
+ r = requests.post(f'https://api.ultralytics.com/v1/models/{model_id}/export',
76
+ json={'format': format},
77
+ headers={'x-api-key': Auth().api_key})
78
+ assert r.status_code == 200, f'{PREFIX}{format} export failure {r.status_code} {r.reason}'
79
+ LOGGER.info(f'{PREFIX}{format} export started ✅')
80
+
81
+
82
+ def get_export(model_id='', format='torchscript'):
83
+ """Get an exported model dictionary with download URL."""
84
+ assert format in export_fmts_hub(), f"Unsupported export format '{format}', valid formats are {export_fmts_hub()}"
85
+ r = requests.post('https://api.ultralytics.com/get-export',
86
+ json={
87
+ 'apiKey': Auth().api_key,
88
+ 'modelId': model_id,
89
+ 'format': format})
90
+ assert r.status_code == 200, f'{PREFIX}{format} get_export failure {r.status_code} {r.reason}'
91
+ return r.json()
92
+
93
+
94
+ def check_dataset(path='', task='detect'):
95
+ """
96
+ Function for error-checking HUB dataset Zip file before upload. It checks a dataset for errors before it is
97
+ uploaded to the HUB. Usage examples are given below.
98
+
99
+ Args:
100
+ path (str, optional): Path to data.zip (with data.yaml inside data.zip). Defaults to ''.
101
+ task (str, optional): Dataset task. Options are 'detect', 'segment', 'pose', 'classify'. Defaults to 'detect'.
102
+
103
+ Example:
104
+ ```python
105
+ from ultralytics.hub import check_dataset
106
+
107
+ check_dataset('path/to/coco8.zip', task='detect') # detect dataset
108
+ check_dataset('path/to/coco8-seg.zip', task='segment') # segment dataset
109
+ check_dataset('path/to/coco8-pose.zip', task='pose') # pose dataset
110
+ ```
111
+ """
112
+ HUBDatasetStats(path=path, task=task).get_json()
113
+ LOGGER.info('Checks completed correctly ✅. Upload this dataset to https://hub.ultralytics.com/datasets/.')
114
+
115
+
116
+ if __name__ == '__main__':
117
+ start()
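A short, hedged sketch of how the HUB helpers defined above fit together; `API_KEY`, `MODEL_ID` and the dataset path are placeholders:

```python
# Hedged usage sketch for the ultralytics.hub helpers defined above.
from ultralytics import hub

hub.login('API_KEY')                                   # wraps Auth(api_key, verbose=True)
print(hub.export_fmts_hub())                           # HUB-supported export format names
hub.export_model(model_id='MODEL_ID', format='onnx')   # POSTs an export job to api.ultralytics.com
hub.check_dataset('path/to/coco8.zip', task='detect')  # validate a dataset zip before upload
hub.logout()                                           # clears the stored api_key in settings.yaml
```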
modules/ultralytics/hub/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (6.3 kB).
 
modules/ultralytics/hub/__pycache__/auth.cpython-312.pyc ADDED
Binary file (6.01 kB).
 
modules/ultralytics/hub/__pycache__/utils.cpython-312.pyc ADDED
Binary file (10.8 kB).
 
modules/ultralytics/hub/auth.py ADDED
@@ -0,0 +1,139 @@
1
+ # Ultralytics YOLO 🚀, AGPL-3.0 license
2
+
3
+ import requests
4
+
5
+ from ultralytics.hub.utils import HUB_API_ROOT, PREFIX, request_with_credentials
6
+ from ultralytics.yolo.utils import LOGGER, SETTINGS, emojis, is_colab, set_settings
7
+
8
+ API_KEY_URL = 'https://hub.ultralytics.com/settings?tab=api+keys'
9
+
10
+
11
+ class Auth:
12
+ id_token = api_key = model_key = False
13
+
14
+ def __init__(self, api_key='', verbose=False):
15
+ """
16
+ Initialize the Auth class with an optional API key.
17
+
18
+ Args:
19
+ api_key (str, optional): May be an API key or a combination API key and model ID, i.e. key_id
20
+ """
21
+ # Split the input API key in case it contains a combined key_model and keep only the API key part
22
+ api_key = api_key.split('_')[0]
23
+
24
+ # Set API key attribute as value passed or SETTINGS API key if none passed
25
+ self.api_key = api_key or SETTINGS.get('api_key', '')
26
+
27
+ # If an API key is provided
28
+ if self.api_key:
29
+ # If the provided API key matches the API key in the SETTINGS
30
+ if self.api_key == SETTINGS.get('api_key'):
31
+ # Log that the user is already logged in
32
+ if verbose:
33
+ LOGGER.info(f'{PREFIX}Authenticated ✅')
34
+ return
35
+ else:
36
+ # Attempt to authenticate with the provided API key
37
+ success = self.authenticate()
38
+ # If the API key is not provided and the environment is a Google Colab notebook
39
+ elif is_colab():
40
+ # Attempt to authenticate using browser cookies
41
+ success = self.auth_with_cookies()
42
+ else:
43
+ # Request an API key
44
+ success = self.request_api_key()
45
+
46
+ # Update SETTINGS with the new API key after successful authentication
47
+ if success:
48
+ set_settings({'api_key': self.api_key})
49
+ # Log that the new login was successful
50
+ if verbose:
51
+ LOGGER.info(f'{PREFIX}New authentication successful ✅')
52
+ elif verbose:
53
+ LOGGER.info(f'{PREFIX}Retrieve API key from {API_KEY_URL}')
54
+
55
+ def request_api_key(self, max_attempts=3):
56
+ """
57
+ Prompt the user to input their API key. Returns the model ID.
58
+ """
59
+ import getpass
60
+ for attempts in range(max_attempts):
61
+ LOGGER.info(f'{PREFIX}Login. Attempt {attempts + 1} of {max_attempts}')
62
+ input_key = getpass.getpass(f'Enter API key from {API_KEY_URL} ')
63
+ self.api_key = input_key.split('_')[0] # remove model id if present
64
+ if self.authenticate():
65
+ return True
66
+ raise ConnectionError(emojis(f'{PREFIX}Failed to authenticate ❌'))
67
+
68
+ def authenticate(self) -> bool:
69
+ """
70
+ Attempt to authenticate with the server using either id_token or API key.
71
+
72
+ Returns:
73
+ bool: True if authentication is successful, False otherwise.
74
+ """
75
+ try:
76
+ header = self.get_auth_header()
77
+ if header:
78
+ r = requests.post(f'{HUB_API_ROOT}/v1/auth', headers=header)
79
+ if not r.json().get('success', False):
80
+ raise ConnectionError('Unable to authenticate.')
81
+ return True
82
+ raise ConnectionError('User has not authenticated locally.')
83
+ except ConnectionError:
84
+ self.id_token = self.api_key = False # reset invalid
85
+ LOGGER.warning(f'{PREFIX}Invalid API key ⚠️')
86
+ return False
87
+
88
+ def auth_with_cookies(self) -> bool:
89
+ """
90
+ Attempt to fetch authentication via cookies and set id_token.
91
+ User must be logged in to HUB and running in a supported browser.
92
+
93
+ Returns:
94
+ bool: True if authentication is successful, False otherwise.
95
+ """
96
+ if not is_colab():
97
+ return False # Currently only works with Colab
98
+ try:
99
+ authn = request_with_credentials(f'{HUB_API_ROOT}/v1/auth/auto')
100
+ if authn.get('success', False):
101
+ self.id_token = authn.get('data', {}).get('idToken', None)
102
+ self.authenticate()
103
+ return True
104
+ raise ConnectionError('Unable to fetch browser authentication details.')
105
+ except ConnectionError:
106
+ self.id_token = False # reset invalid
107
+ return False
108
+
109
+ def get_auth_header(self):
110
+ """
111
+ Get the authentication header for making API requests.
112
+
113
+ Returns:
114
+ (dict): The authentication header if id_token or API key is set, None otherwise.
115
+ """
116
+ if self.id_token:
117
+ return {'authorization': f'Bearer {self.id_token}'}
118
+ elif self.api_key:
119
+ return {'x-api-key': self.api_key}
120
+ else:
121
+ return None
122
+
123
+ def get_state(self) -> bool:
124
+ """
125
+ Get the authentication state.
126
+
127
+ Returns:
128
+ bool: True if either id_token or API key is set, False otherwise.
129
+ """
130
+ return self.id_token or self.api_key
131
+
132
+ def set_api_key(self, key: str):
133
+ """
134
+ Set the API key for authentication.
135
+
136
+ Args:
137
+ key (str): The API key string.
138
+ """
139
+ self.api_key = key
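A hedged sketch of how `Auth` is consumed by the rest of the hub code, producing a header for requests against `HUB_API_ROOT`:

```python
# Hedged sketch: obtaining a request header from the Auth class defined above.
import requests

from ultralytics.hub.auth import Auth
from ultralytics.hub.utils import HUB_API_ROOT

auth = Auth('API_KEY')           # placeholder key; a trailing '_modelid' suffix would be stripped
header = auth.get_auth_header()  # {'x-api-key': ...}, {'authorization': 'Bearer ...'}, or None
if header:
    r = requests.post(f'{HUB_API_ROOT}/v1/auth', headers=header)
    print(r.status_code)
```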
modules/ultralytics/hub/session.py ADDED
@@ -0,0 +1,189 @@
1
+ # Ultralytics YOLO 🚀, AGPL-3.0 license
2
+ import signal
3
+ import sys
4
+ from pathlib import Path
5
+ from time import sleep
6
+
7
+ import requests
8
+
9
+ from ultralytics.hub.utils import HUB_API_ROOT, PREFIX, smart_request
10
+ from ultralytics.yolo.utils import LOGGER, __version__, checks, emojis, is_colab, threaded
11
+ from ultralytics.yolo.utils.errors import HUBModelError
12
+
13
+ AGENT_NAME = f'python-{__version__}-colab' if is_colab() else f'python-{__version__}-local'
14
+
15
+
16
+ class HUBTrainingSession:
17
+ """
18
+ HUB training session for Ultralytics HUB YOLO models. Handles model initialization, heartbeats, and checkpointing.
19
+
20
+ Args:
21
+ url (str): Model identifier used to initialize the HUB training session.
22
+
23
+ Attributes:
24
+ agent_id (str): Identifier for the instance communicating with the server.
25
+ model_id (str): Identifier for the YOLOv5 model being trained.
26
+ model_url (str): URL for the model in Ultralytics HUB.
27
+ api_url (str): API URL for the model in Ultralytics HUB.
28
+ auth_header (Dict): Authentication header for the Ultralytics HUB API requests.
29
+ rate_limits (Dict): Rate limits for different API calls (in seconds).
30
+ timers (Dict): Timers for rate limiting.
31
+ metrics_queue (Dict): Queue for the model's metrics.
32
+ model (Dict): Model data fetched from Ultralytics HUB.
33
+ alive (bool): Indicates if the heartbeat loop is active.
34
+ """
35
+
36
+ def __init__(self, url):
37
+ """
38
+ Initialize the HUBTrainingSession with the provided model identifier.
39
+
40
+ Args:
41
+ url (str): Model identifier used to initialize the HUB training session.
42
+ It can be a URL string or a model key with specific format.
43
+
44
+ Raises:
45
+ ValueError: If the provided model identifier is invalid.
46
+ ConnectionError: If connecting with global API key is not supported.
47
+ """
48
+
49
+ from ultralytics.hub.auth import Auth
50
+
51
+ # Parse input
52
+ if url.startswith('https://hub.ultralytics.com/models/'):
53
+ url = url.split('https://hub.ultralytics.com/models/')[-1]
54
+ if [len(x) for x in url.split('_')] == [42, 20]:
55
+ key, model_id = url.split('_')
56
+ elif len(url) == 20:
57
+ key, model_id = '', url
58
+ else:
59
+ raise HUBModelError(f"model='{url}' not found. Check format is correct, i.e. "
60
+ f"model='https://hub.ultralytics.com/models/MODEL_ID' and try again.")
61
+
62
+ # Authorize
63
+ auth = Auth(key)
64
+ self.agent_id = None # identifies which instance is communicating with server
65
+ self.model_id = model_id
66
+ self.model_url = f'https://hub.ultralytics.com/models/{model_id}'
67
+ self.api_url = f'{HUB_API_ROOT}/v1/models/{model_id}'
68
+ self.auth_header = auth.get_auth_header()
69
+ self.rate_limits = {'metrics': 3.0, 'ckpt': 900.0, 'heartbeat': 300.0} # rate limits (seconds)
70
+ self.timers = {} # rate limit timers (seconds)
71
+ self.metrics_queue = {} # metrics queue
72
+ self.model = self._get_model()
73
+ self.alive = True
74
+ self._start_heartbeat() # start heartbeats
75
+ self._register_signal_handlers()
76
+ LOGGER.info(f'{PREFIX}View model at {self.model_url} 🚀')
77
+
78
+ def _register_signal_handlers(self):
79
+ """Register signal handlers for SIGTERM and SIGINT signals to gracefully handle termination."""
80
+ signal.signal(signal.SIGTERM, self._handle_signal)
81
+ signal.signal(signal.SIGINT, self._handle_signal)
82
+
83
+ def _handle_signal(self, signum, frame):
84
+ """
85
+ Handle kill signals and prevent heartbeats from being sent on Colab after termination.
86
+ This method does not use frame, it is included as it is passed by signal.
87
+ """
88
+ if self.alive is True:
89
+ LOGGER.info(f'{PREFIX}Kill signal received! ❌')
90
+ self._stop_heartbeat()
91
+ sys.exit(signum)
92
+
93
+ def _stop_heartbeat(self):
94
+ """Terminate the heartbeat loop."""
95
+ self.alive = False
96
+
97
+ def upload_metrics(self):
98
+ """Upload model metrics to Ultralytics HUB."""
99
+ payload = {'metrics': self.metrics_queue.copy(), 'type': 'metrics'}
100
+ smart_request('post', self.api_url, json=payload, headers=self.auth_header, code=2)
101
+
102
+ def _get_model(self):
103
+ """Fetch and return model data from Ultralytics HUB."""
104
+ api_url = f'{HUB_API_ROOT}/v1/models/{self.model_id}'
105
+
106
+ try:
107
+ response = smart_request('get', api_url, headers=self.auth_header, thread=False, code=0)
108
+ data = response.json().get('data', None)
109
+
110
+ if data.get('status', None) == 'trained':
111
+ raise ValueError(emojis(f'Model is already trained and uploaded to {self.model_url} 🚀'))
112
+
113
+ if not data.get('data', None):
114
+ raise ValueError('Dataset may still be processing. Please wait a minute and try again.') # RF fix
115
+ self.model_id = data['id']
116
+
117
+ if data['status'] == 'new': # new model to start training
118
+ self.train_args = {
119
+ # TODO: deprecate 'batch_size' key for 'batch' in 3Q23
120
+ 'batch': data['batch' if ('batch' in data) else 'batch_size'],
121
+ 'epochs': data['epochs'],
122
+ 'imgsz': data['imgsz'],
123
+ 'patience': data['patience'],
124
+ 'device': data['device'],
125
+ 'cache': data['cache'],
126
+ 'data': data['data']}
127
+ self.model_file = data.get('cfg') or data.get('weights') # cfg for pretrained=False
128
+ self.model_file = checks.check_yolov5u_filename(self.model_file, verbose=False) # YOLOv5->YOLOv5u
129
+ elif data['status'] == 'training': # existing model to resume training
130
+ self.train_args = {'data': data['data'], 'resume': True}
131
+ self.model_file = data['resume']
132
+
133
+ return data
134
+ except requests.exceptions.ConnectionError as e:
135
+ raise ConnectionRefusedError('ERROR: The HUB server is not online. Please try again later.') from e
136
+ except Exception:
137
+ raise
138
+
139
+ def upload_model(self, epoch, weights, is_best=False, map=0.0, final=False):
140
+ """
141
+ Upload a model checkpoint to Ultralytics HUB.
142
+
143
+ Args:
144
+ epoch (int): The current training epoch.
145
+ weights (str): Path to the model weights file.
146
+ is_best (bool): Indicates if the current model is the best one so far.
147
+ map (float): Mean average precision of the model.
148
+ final (bool): Indicates if the model is the final model after training.
149
+ """
150
+ if Path(weights).is_file():
151
+ with open(weights, 'rb') as f:
152
+ file = f.read()
153
+ else:
154
+ LOGGER.warning(f'{PREFIX}WARNING ⚠️ Model upload issue. Missing model {weights}.')
155
+ file = None
156
+ url = f'{self.api_url}/upload'
157
+ # url = 'http://httpbin.org/post' # for debug
158
+ data = {'epoch': epoch}
159
+ if final:
160
+ data.update({'type': 'final', 'map': map})
161
+ smart_request('post',
162
+ url,
163
+ data=data,
164
+ files={'best.pt': file},
165
+ headers=self.auth_header,
166
+ retry=10,
167
+ timeout=3600,
168
+ thread=False,
169
+ progress=True,
170
+ code=4)
171
+ else:
172
+ data.update({'type': 'epoch', 'isBest': bool(is_best)})
173
+ smart_request('post', url, data=data, files={'last.pt': file}, headers=self.auth_header, code=3)
174
+
175
+ @threaded
176
+ def _start_heartbeat(self):
177
+ """Begin a threaded heartbeat loop to report the agent's status to Ultralytics HUB."""
178
+ while self.alive:
179
+ r = smart_request('post',
180
+ f'{HUB_API_ROOT}/v1/agent/heartbeat/models/{self.model_id}',
181
+ json={
182
+ 'agent': AGENT_NAME,
183
+ 'agentId': self.agent_id},
184
+ headers=self.auth_header,
185
+ retry=0,
186
+ code=5,
187
+ thread=False) # already in a thread
188
+ self.agent_id = r.json().get('data', {}).get('agentId', None)
189
+ sleep(self.rate_limits['heartbeat'])
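A hedged sketch of the session life cycle implemented above; `MODEL_ID` is a placeholder and the metrics payload format is inferred from `upload_metrics()`:

```python
# Hedged sketch: driving a HUBTrainingSession as defined above.
import json

from ultralytics.hub.session import HUBTrainingSession

session = HUBTrainingSession('https://hub.ultralytics.com/models/MODEL_ID')  # parses the URL, starts heartbeats
session.metrics_queue[0] = json.dumps({'metrics/mAP50(B)': 0.5})             # metrics keyed by epoch (assumed)
session.upload_metrics()                                                     # POSTs {'metrics': ..., 'type': 'metrics'}
session.upload_model(epoch=0, weights='last.pt', is_best=False)              # checkpoint upload; warns if the file is missing
```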
modules/ultralytics/hub/utils.py ADDED
@@ -0,0 +1,217 @@
1
+ # Ultralytics YOLO 🚀, AGPL-3.0 license
2
+
3
+ import os
4
+ import platform
5
+ import random
6
+ import sys
7
+ import threading
8
+ import time
9
+ from pathlib import Path
10
+
11
+ import requests
12
+ from tqdm import tqdm
13
+
14
+ from ultralytics.yolo.utils import (ENVIRONMENT, LOGGER, ONLINE, RANK, SETTINGS, TESTS_RUNNING, TQDM_BAR_FORMAT,
15
+ TryExcept, __version__, colorstr, get_git_origin_url, is_colab, is_git_dir,
16
+ is_pip_package)
17
+
18
+ PREFIX = colorstr('Ultralytics HUB: ')
19
+ HELP_MSG = 'If this issue persists please visit https://github.com/ultralytics/hub/issues for assistance.'
20
+ HUB_API_ROOT = os.environ.get('ULTRALYTICS_HUB_API', 'https://api.ultralytics.com')
21
+
22
+
23
+ def request_with_credentials(url: str) -> any:
24
+ """
25
+ Make an AJAX request with cookies attached in a Google Colab environment.
26
+
27
+ Args:
28
+ url (str): The URL to make the request to.
29
+
30
+ Returns:
31
+ (any): The response data from the AJAX request.
32
+
33
+ Raises:
34
+ OSError: If the function is not run in a Google Colab environment.
35
+ """
36
+ if not is_colab():
37
+ raise OSError('request_with_credentials() must run in a Colab environment')
38
+ from google.colab import output # noqa
39
+ from IPython import display # noqa
40
+ display.display(
41
+ display.Javascript("""
42
+ window._hub_tmp = new Promise((resolve, reject) => {
43
+ const timeout = setTimeout(() => reject("Failed authenticating existing browser session"), 5000)
44
+ fetch("%s", {
45
+ method: 'POST',
46
+ credentials: 'include'
47
+ })
48
+ .then((response) => resolve(response.json()))
49
+ .then((json) => {
50
+ clearTimeout(timeout);
51
+ }).catch((err) => {
52
+ clearTimeout(timeout);
53
+ reject(err);
54
+ });
55
+ });
56
+ """ % url))
57
+ return output.eval_js('_hub_tmp')
58
+
59
+
60
+ def requests_with_progress(method, url, **kwargs):
61
+ """
62
+ Make an HTTP request using the specified method and URL, with an optional progress bar.
63
+
64
+ Args:
65
+ method (str): The HTTP method to use (e.g. 'GET', 'POST').
66
+ url (str): The URL to send the request to.
67
+ **kwargs (dict): Additional keyword arguments to pass to the underlying `requests.request` function.
68
+
69
+ Returns:
70
+ (requests.Response): The response object from the HTTP request.
71
+
72
+ Note:
73
+ If 'progress' is set to True, the progress bar will display the download progress
74
+ for responses with a known content length.
75
+ """
76
+ progress = kwargs.pop('progress', False)
77
+ if not progress:
78
+ return requests.request(method, url, **kwargs)
79
+ response = requests.request(method, url, stream=True, **kwargs)
80
+ total = int(response.headers.get('content-length', 0)) # total size
81
+ pbar = tqdm(total=total, unit='B', unit_scale=True, unit_divisor=1024, bar_format=TQDM_BAR_FORMAT)
82
+ for data in response.iter_content(chunk_size=1024):
83
+ pbar.update(len(data))
84
+ pbar.close()
85
+ return response
86
+
87
+
88
+ def smart_request(method, url, retry=3, timeout=30, thread=True, code=-1, verbose=True, progress=False, **kwargs):
89
+ """
90
+ Makes an HTTP request using the 'requests' library, with exponential backoff retries up to a specified timeout.
91
+
92
+ Args:
93
+ method (str): The HTTP method to use for the request. Choices are 'post' and 'get'.
94
+ url (str): The URL to make the request to.
95
+ retry (int, optional): Number of retries to attempt before giving up. Default is 3.
96
+ timeout (int, optional): Timeout in seconds after which the function will give up retrying. Default is 30.
97
+ thread (bool, optional): Whether to execute the request in a separate daemon thread. Default is True.
98
+ code (int, optional): An identifier for the request, used for logging purposes. Default is -1.
99
+ verbose (bool, optional): A flag to determine whether to print out to console or not. Default is True.
100
+ progress (bool, optional): Whether to show a progress bar during the request. Default is False.
101
+ **kwargs (dict): Keyword arguments to be passed to the requests function specified in method.
102
+
103
+ Returns:
104
+ (requests.Response): The HTTP response object. If the request is executed in a separate thread, returns None.
105
+ """
106
+ retry_codes = (408, 500) # retry only these codes
107
+
108
+ @TryExcept(verbose=verbose)
109
+ def func(func_method, func_url, **func_kwargs):
110
+ """Make HTTP requests with retries and timeouts, with optional progress tracking."""
111
+ r = None # response
112
+ t0 = time.time() # initial time for timer
113
+ for i in range(retry + 1):
114
+ if (time.time() - t0) > timeout:
115
+ break
116
+ r = requests_with_progress(func_method, func_url, **func_kwargs) # i.e. get(url, data, json, files)
117
+ if r.status_code < 300: # return codes in the 2xx range are generally considered "good" or "successful"
118
+ break
119
+ try:
120
+ m = r.json().get('message', 'No JSON message.')
121
+ except AttributeError:
122
+ m = 'Unable to read JSON.'
123
+ if i == 0:
124
+ if r.status_code in retry_codes:
125
+ m += f' Retrying {retry}x for {timeout}s.' if retry else ''
126
+ elif r.status_code == 429: # rate limit
127
+ h = r.headers # response headers
128
+ m = f"Rate limit reached ({h['X-RateLimit-Remaining']}/{h['X-RateLimit-Limit']}). " \
129
+ f"Please retry after {h['Retry-After']}s."
130
+ if verbose:
131
+ LOGGER.warning(f'{PREFIX}{m} {HELP_MSG} ({r.status_code} #{code})')
132
+ if r.status_code not in retry_codes:
133
+ return r
134
+ time.sleep(2 ** i) # exponential standoff
135
+ return r
136
+
137
+ args = method, url
138
+ kwargs['progress'] = progress
139
+ if thread:
140
+ threading.Thread(target=func, args=args, kwargs=kwargs, daemon=True).start()
141
+ else:
142
+ return func(*args, **kwargs)
143
+
144
+
145
+ class Events:
146
+ """
147
+ A class for collecting anonymous event analytics. Event analytics are enabled when sync=True in settings and
148
+ disabled when sync=False. Run 'yolo settings' to see and update settings YAML file.
149
+
150
+ Attributes:
151
+ url (str): The URL to send anonymous events.
152
+ rate_limit (float): The rate limit in seconds for sending events.
153
+ metadata (dict): A dictionary containing metadata about the environment.
154
+ enabled (bool): A flag to enable or disable Events based on certain conditions.
155
+ """
156
+
157
+ url = 'https://www.google-analytics.com/mp/collect?measurement_id=G-X8NCJYTQXM&api_secret=QLQrATrNSwGRFRLE-cbHJw'
158
+
159
+ def __init__(self):
160
+ """
161
+ Initializes the Events object with default values for events, rate_limit, and metadata.
162
+ """
163
+ self.events = [] # events list
164
+ self.rate_limit = 60.0 # rate limit (seconds)
165
+ self.t = 0.0 # rate limit timer (seconds)
166
+ self.metadata = {
167
+ 'cli': Path(sys.argv[0]).name == 'yolo',
168
+ 'install': 'git' if is_git_dir() else 'pip' if is_pip_package() else 'other',
169
+ 'python': '.'.join(platform.python_version_tuple()[:2]), # i.e. 3.10
170
+ 'version': __version__,
171
+ 'env': ENVIRONMENT,
172
+ 'session_id': round(random.random() * 1E15),
173
+ 'engagement_time_msec': 1000}
174
+ self.enabled = \
175
+ SETTINGS['sync'] and \
176
+ RANK in (-1, 0) and \
177
+ not TESTS_RUNNING and \
178
+ ONLINE and \
179
+ (is_pip_package() or get_git_origin_url() == 'https://github.com/ultralytics/ultralytics.git')
180
+
181
+ def __call__(self, cfg):
182
+ """
183
+ Attempts to add a new event to the events list and send events if the rate limit is reached.
184
+
185
+ Args:
186
+ cfg (IterableSimpleNamespace): The configuration object containing mode and task information.
187
+ """
188
+ if not self.enabled:
189
+ # Events disabled, do nothing
190
+ return
191
+
192
+ # Attempt to add to events
193
+ if len(self.events) < 25: # Events list limited to 25 events (drop any events past this)
194
+ params = {**self.metadata, **{'task': cfg.task}}
195
+ if cfg.mode == 'export':
196
+ params['format'] = cfg.format
197
+ self.events.append({'name': cfg.mode, 'params': params})
198
+
199
+ # Check rate limit
200
+ t = time.time()
201
+ if (t - self.t) < self.rate_limit:
202
+ # Time is under rate limiter, wait to send
203
+ return
204
+
205
+ # Time is over rate limiter, send now
206
+ data = {'client_id': SETTINGS['uuid'], 'events': self.events} # SHA-256 anonymized UUID hash and events list
207
+
208
+ # POST equivalent to requests.post(self.url, json=data)
209
+ smart_request('post', self.url, json=data, retry=0, verbose=False)
210
+
211
+ # Reset events and rate limit timer
212
+ self.events = []
213
+ self.t = t
214
+
215
+
216
+ # Run below code on hub/utils init -------------------------------------------------------------------------------------
217
+ events = Events()
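A hedged sketch of the two main entry points defined above, the retrying `smart_request` helper and the module-level `events` collector; `MODEL_ID` is a placeholder and the cfg namespace is a minimal stand-in:

```python
# Hedged sketch: smart_request with retries, and the anonymous-events collector.
from types import SimpleNamespace

from ultralytics.hub.utils import HUB_API_ROOT, events, smart_request

r = smart_request('get', f'{HUB_API_ROOT}/v1/models/MODEL_ID',  # placeholder model id
                  retry=3, timeout=30, thread=False, code=0)    # thread=False returns the Response
if r is not None:
    print(r.status_code)

cfg = SimpleNamespace(task='detect', mode='train', format=None)  # stand-in for the real cfg namespace
events(cfg)                                                      # no-op unless analytics are enabled in settings
```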
modules/ultralytics/models/README.md ADDED
@@ -0,0 +1,45 @@
+ ## Models
+
+ Welcome to the Ultralytics Models directory! Here you will find a wide variety of pre-configured model configuration
+ files (`*.yaml`s) that can be used to create custom YOLO models. The models in this directory have been expertly crafted
+ and fine-tuned by the Ultralytics team to provide the best performance for a wide range of object detection and image
+ segmentation tasks.
+
+ These model configurations cover a wide range of scenarios, from simple object detection to more complex tasks like
+ instance segmentation and object tracking. They are also designed to run efficiently on a variety of hardware platforms,
+ from CPUs to GPUs. Whether you are a seasoned machine learning practitioner or just getting started with YOLO, this
+ directory provides a great starting point for your custom model development needs.
+
+ To get started, simply browse through the models in this directory and find one that best suits your needs. Once you've
+ selected a model, you can use the provided `*.yaml` file to train and deploy your custom YOLO model with ease. See full
+ details at the Ultralytics [Docs](https://docs.ultralytics.com/models), and if you need help or have any questions, feel free
+ to reach out to the Ultralytics team for support. So, don't wait, start creating your custom YOLO model now!
+
+ ### Usage
+
+ Model `*.yaml` files may be used directly in the Command Line Interface (CLI) with a `yolo` command:
+
+ ```bash
+ yolo task=detect mode=train model=yolov8n.yaml data=coco128.yaml epochs=100
+ ```
+
+ They may also be used directly in a Python environment, and accept the same
+ [arguments](https://docs.ultralytics.com/usage/cfg/) as in the CLI example above:
+
+ ```python
+ from ultralytics import YOLO
+
+ model = YOLO("model.yaml") # build a YOLOv8n model from scratch
+ # YOLO("model.pt") use pre-trained model if available
+ model.info() # display model information
+ model.train(data="coco128.yaml", epochs=100) # train the model
+ ```
+
+ ## Pre-trained Model Architectures
+
+ Ultralytics supports many model architectures. Visit https://docs.ultralytics.com/models to view detailed information
+ and usage. Any of these models can be used by loading their configs or pretrained checkpoints if available.
+
+ ## Contributing New Models
+
+ If you've developed a new model architecture or have improvements for existing models that you'd like to contribute to the Ultralytics community, please submit your contribution in a new Pull Request. For more details, visit our [Contributing Guide](https://docs.ultralytics.com/help/contributing).
modules/ultralytics/models/rt-detr/rtdetr-l.yaml ADDED
@@ -0,0 +1,50 @@
1
+ # Ultralytics YOLO 🚀, AGPL-3.0 license
2
+ # RT-DETR-l object detection model with P3-P5 outputs. For details see https://docs.ultralytics.com/models/rtdetr
3
+
4
+ # Parameters
5
+ nc: 80 # number of classes
6
+ scales: # model compound scaling constants, i.e. 'model=yolov8n-cls.yaml' will call yolov8-cls.yaml with scale 'n'
7
+ # [depth, width, max_channels]
8
+ l: [1.00, 1.00, 1024]
9
+
10
+ backbone:
11
+ # [from, repeats, module, args]
12
+ - [-1, 1, HGStem, [32, 48]] # 0-P2/4
13
+ - [-1, 6, HGBlock, [48, 128, 3]] # stage 1
14
+
15
+ - [-1, 1, DWConv, [128, 3, 2, 1, False]] # 2-P3/8
16
+ - [-1, 6, HGBlock, [96, 512, 3]] # stage 2
17
+
18
+ - [-1, 1, DWConv, [512, 3, 2, 1, False]] # 4-P3/16
19
+ - [-1, 6, HGBlock, [192, 1024, 5, True, False]] # cm, c2, k, light, shortcut
20
+ - [-1, 6, HGBlock, [192, 1024, 5, True, True]]
21
+ - [-1, 6, HGBlock, [192, 1024, 5, True, True]] # stage 3
22
+
23
+ - [-1, 1, DWConv, [1024, 3, 2, 1, False]] # 8-P4/32
24
+ - [-1, 6, HGBlock, [384, 2048, 5, True, False]] # stage 4
25
+
26
+ head:
27
+ - [-1, 1, Conv, [256, 1, 1, None, 1, 1, False]] # 10 input_proj.2
28
+ - [-1, 1, AIFI, [1024, 8]]
29
+ - [-1, 1, Conv, [256, 1, 1]] # 12, Y5, lateral_convs.0
30
+
31
+ - [-1, 1, nn.Upsample, [None, 2, 'nearest']]
32
+ - [7, 1, Conv, [256, 1, 1, None, 1, 1, False]] # 14 input_proj.1
33
+ - [[-2, -1], 1, Concat, [1]]
34
+ - [-1, 3, RepC3, [256]] # 16, fpn_blocks.0
35
+ - [-1, 1, Conv, [256, 1, 1]] # 17, Y4, lateral_convs.1
36
+
37
+ - [-1, 1, nn.Upsample, [None, 2, 'nearest']]
38
+ - [3, 1, Conv, [256, 1, 1, None, 1, 1, False]] # 19 input_proj.0
39
+ - [[-2, -1], 1, Concat, [1]] # cat backbone P4
40
+ - [-1, 3, RepC3, [256]] # X3 (21), fpn_blocks.1
41
+
42
+ - [-1, 1, Conv, [256, 3, 2]] # 22, downsample_convs.0
43
+ - [[-1, 17], 1, Concat, [1]] # cat Y4
44
+ - [-1, 3, RepC3, [256]] # F4 (24), pan_blocks.0
45
+
46
+ - [-1, 1, Conv, [256, 3, 2]] # 25, downsample_convs.1
47
+ - [[-1, 12], 1, Concat, [1]] # cat Y5
48
+ - [-1, 3, RepC3, [256]] # F5 (27), pan_blocks.1
49
+
50
+ - [[21, 24, 27], 1, RTDETRDecoder, [nc]] # Detect(P3, P4, P5)
modules/ultralytics/models/rt-detr/rtdetr-x.yaml ADDED
@@ -0,0 +1,54 @@
1
+ # Ultralytics YOLO 🚀, AGPL-3.0 license
2
+ # RT-DETR-x object detection model with P3-P5 outputs. For details see https://docs.ultralytics.com/models/rtdetr
3
+
4
+ # Parameters
5
+ nc: 80 # number of classes
6
+ scales: # model compound scaling constants, i.e. 'model=yolov8n-cls.yaml' will call yolov8-cls.yaml with scale 'n'
7
+ # [depth, width, max_channels]
8
+ x: [1.00, 1.00, 2048]
9
+
10
+ backbone:
11
+ # [from, repeats, module, args]
12
+ - [-1, 1, HGStem, [32, 64]] # 0-P2/4
13
+ - [-1, 6, HGBlock, [64, 128, 3]] # stage 1
14
+
15
+ - [-1, 1, DWConv, [128, 3, 2, 1, False]] # 2-P3/8
16
+ - [-1, 6, HGBlock, [128, 512, 3]]
17
+ - [-1, 6, HGBlock, [128, 512, 3, False, True]] # 4-stage 2
18
+
19
+ - [-1, 1, DWConv, [512, 3, 2, 1, False]] # 5-P3/16
20
+ - [-1, 6, HGBlock, [256, 1024, 5, True, False]] # cm, c2, k, light, shortcut
21
+ - [-1, 6, HGBlock, [256, 1024, 5, True, True]]
22
+ - [-1, 6, HGBlock, [256, 1024, 5, True, True]]
23
+ - [-1, 6, HGBlock, [256, 1024, 5, True, True]]
24
+ - [-1, 6, HGBlock, [256, 1024, 5, True, True]] # 10-stage 3
25
+
26
+ - [-1, 1, DWConv, [1024, 3, 2, 1, False]] # 11-P4/32
27
+ - [-1, 6, HGBlock, [512, 2048, 5, True, False]]
28
+ - [-1, 6, HGBlock, [512, 2048, 5, True, True]] # 13-stage 4
29
+
30
+ head:
31
+ - [-1, 1, Conv, [384, 1, 1, None, 1, 1, False]] # 14 input_proj.2
32
+ - [-1, 1, AIFI, [2048, 8]]
33
+ - [-1, 1, Conv, [384, 1, 1]] # 16, Y5, lateral_convs.0
34
+
35
+ - [-1, 1, nn.Upsample, [None, 2, 'nearest']]
36
+ - [10, 1, Conv, [384, 1, 1, None, 1, 1, False]] # 18 input_proj.1
37
+ - [[-2, -1], 1, Concat, [1]]
38
+ - [-1, 3, RepC3, [384]] # 20, fpn_blocks.0
39
+ - [-1, 1, Conv, [384, 1, 1]] # 21, Y4, lateral_convs.1
40
+
41
+ - [-1, 1, nn.Upsample, [None, 2, 'nearest']]
42
+ - [4, 1, Conv, [384, 1, 1, None, 1, 1, False]] # 23 input_proj.0
43
+ - [[-2, -1], 1, Concat, [1]] # cat backbone P4
44
+ - [-1, 3, RepC3, [384]] # X3 (25), fpn_blocks.1
45
+
46
+ - [-1, 1, Conv, [384, 3, 2]] # 26, downsample_convs.0
47
+ - [[-1, 21], 1, Concat, [1]] # cat Y4
48
+ - [-1, 3, RepC3, [384]] # F4 (28), pan_blocks.0
49
+
50
+ - [-1, 1, Conv, [384, 3, 2]] # 29, downsample_convs.1
51
+ - [[-1, 16], 1, Concat, [1]] # cat Y5
52
+ - [-1, 3, RepC3, [384]] # F5 (31), pan_blocks.1
53
+
54
+ - [[25, 28, 31], 1, RTDETRDecoder, [nc]] # Detect(P3, P4, P5)
modules/ultralytics/models/v3/yolov3-spp.yaml ADDED
@@ -0,0 +1,48 @@
1
+ # Ultralytics YOLO 🚀, AGPL-3.0 license
2
+ # YOLOv3-SPP object detection model with P3-P5 outputs. For details see https://docs.ultralytics.com/models/yolov3
3
+
4
+ # Parameters
5
+ nc: 80 # number of classes
6
+ depth_multiple: 1.0 # model depth multiple
7
+ width_multiple: 1.0 # layer channel multiple
8
+
9
+ # darknet53 backbone
10
+ backbone:
11
+ # [from, number, module, args]
12
+ [[-1, 1, Conv, [32, 3, 1]], # 0
13
+ [-1, 1, Conv, [64, 3, 2]], # 1-P1/2
14
+ [-1, 1, Bottleneck, [64]],
15
+ [-1, 1, Conv, [128, 3, 2]], # 3-P2/4
16
+ [-1, 2, Bottleneck, [128]],
17
+ [-1, 1, Conv, [256, 3, 2]], # 5-P3/8
18
+ [-1, 8, Bottleneck, [256]],
19
+ [-1, 1, Conv, [512, 3, 2]], # 7-P4/16
20
+ [-1, 8, Bottleneck, [512]],
21
+ [-1, 1, Conv, [1024, 3, 2]], # 9-P5/32
22
+ [-1, 4, Bottleneck, [1024]], # 10
23
+ ]
24
+
25
+ # YOLOv3-SPP head
26
+ head:
27
+ [[-1, 1, Bottleneck, [1024, False]],
28
+ [-1, 1, SPP, [512, [5, 9, 13]]],
29
+ [-1, 1, Conv, [1024, 3, 1]],
30
+ [-1, 1, Conv, [512, 1, 1]],
31
+ [-1, 1, Conv, [1024, 3, 1]], # 15 (P5/32-large)
32
+
33
+ [-2, 1, Conv, [256, 1, 1]],
34
+ [-1, 1, nn.Upsample, [None, 2, 'nearest']],
35
+ [[-1, 8], 1, Concat, [1]], # cat backbone P4
36
+ [-1, 1, Bottleneck, [512, False]],
37
+ [-1, 1, Bottleneck, [512, False]],
38
+ [-1, 1, Conv, [256, 1, 1]],
39
+ [-1, 1, Conv, [512, 3, 1]], # 22 (P4/16-medium)
40
+
41
+ [-2, 1, Conv, [128, 1, 1]],
42
+ [-1, 1, nn.Upsample, [None, 2, 'nearest']],
43
+ [[-1, 6], 1, Concat, [1]], # cat backbone P3
44
+ [-1, 1, Bottleneck, [256, False]],
45
+ [-1, 2, Bottleneck, [256, False]], # 27 (P3/8-small)
46
+
47
+ [[27, 22, 15], 1, Detect, [nc]], # Detect(P3, P4, P5)
48
+ ]
modules/ultralytics/models/v3/yolov3-tiny.yaml ADDED
@@ -0,0 +1,39 @@
1
+ # Ultralytics YOLO 🚀, AGPL-3.0 license
2
+ # YOLOv3-tiny object detection model with P4-P5 outputs. For details see https://docs.ultralytics.com/models/yolov3
3
+
4
+ # Parameters
5
+ nc: 80 # number of classes
6
+ depth_multiple: 1.0 # model depth multiple
7
+ width_multiple: 1.0 # layer channel multiple
8
+
9
+ # YOLOv3-tiny backbone
10
+ backbone:
11
+ # [from, number, module, args]
12
+ [[-1, 1, Conv, [16, 3, 1]], # 0
13
+ [-1, 1, nn.MaxPool2d, [2, 2, 0]], # 1-P1/2
14
+ [-1, 1, Conv, [32, 3, 1]],
15
+ [-1, 1, nn.MaxPool2d, [2, 2, 0]], # 3-P2/4
16
+ [-1, 1, Conv, [64, 3, 1]],
17
+ [-1, 1, nn.MaxPool2d, [2, 2, 0]], # 5-P3/8
18
+ [-1, 1, Conv, [128, 3, 1]],
19
+ [-1, 1, nn.MaxPool2d, [2, 2, 0]], # 7-P4/16
20
+ [-1, 1, Conv, [256, 3, 1]],
21
+ [-1, 1, nn.MaxPool2d, [2, 2, 0]], # 9-P5/32
22
+ [-1, 1, Conv, [512, 3, 1]],
23
+ [-1, 1, nn.ZeroPad2d, [[0, 1, 0, 1]]], # 11
24
+ [-1, 1, nn.MaxPool2d, [2, 1, 0]], # 12
25
+ ]
26
+
27
+ # YOLOv3-tiny head
28
+ head:
29
+ [[-1, 1, Conv, [1024, 3, 1]],
30
+ [-1, 1, Conv, [256, 1, 1]],
31
+ [-1, 1, Conv, [512, 3, 1]], # 15 (P5/32-large)
32
+
33
+ [-2, 1, Conv, [128, 1, 1]],
34
+ [-1, 1, nn.Upsample, [None, 2, 'nearest']],
35
+ [[-1, 8], 1, Concat, [1]], # cat backbone P4
36
+ [-1, 1, Conv, [256, 3, 1]], # 19 (P4/16-medium)
37
+
38
+ [[19, 15], 1, Detect, [nc]], # Detect(P4, P5)
39
+ ]
modules/ultralytics/models/v3/yolov3.yaml ADDED
@@ -0,0 +1,48 @@
1
+ # Ultralytics YOLO 🚀, AGPL-3.0 license
2
+ # YOLOv3 object detection model with P3-P5 outputs. For details see https://docs.ultralytics.com/models/yolov3
3
+
4
+ # Parameters
5
+ nc: 80 # number of classes
6
+ depth_multiple: 1.0 # model depth multiple
7
+ width_multiple: 1.0 # layer channel multiple
8
+
9
+ # darknet53 backbone
10
+ backbone:
11
+ # [from, number, module, args]
12
+ [[-1, 1, Conv, [32, 3, 1]], # 0
13
+ [-1, 1, Conv, [64, 3, 2]], # 1-P1/2
14
+ [-1, 1, Bottleneck, [64]],
15
+ [-1, 1, Conv, [128, 3, 2]], # 3-P2/4
16
+ [-1, 2, Bottleneck, [128]],
17
+ [-1, 1, Conv, [256, 3, 2]], # 5-P3/8
18
+ [-1, 8, Bottleneck, [256]],
19
+ [-1, 1, Conv, [512, 3, 2]], # 7-P4/16
20
+ [-1, 8, Bottleneck, [512]],
21
+ [-1, 1, Conv, [1024, 3, 2]], # 9-P5/32
22
+ [-1, 4, Bottleneck, [1024]], # 10
23
+ ]
24
+
25
+ # YOLOv3 head
26
+ head:
27
+ [[-1, 1, Bottleneck, [1024, False]],
28
+ [-1, 1, Conv, [512, 1, 1]],
29
+ [-1, 1, Conv, [1024, 3, 1]],
30
+ [-1, 1, Conv, [512, 1, 1]],
31
+ [-1, 1, Conv, [1024, 3, 1]], # 15 (P5/32-large)
32
+
33
+ [-2, 1, Conv, [256, 1, 1]],
34
+ [-1, 1, nn.Upsample, [None, 2, 'nearest']],
35
+ [[-1, 8], 1, Concat, [1]], # cat backbone P4
36
+ [-1, 1, Bottleneck, [512, False]],
37
+ [-1, 1, Bottleneck, [512, False]],
38
+ [-1, 1, Conv, [256, 1, 1]],
39
+ [-1, 1, Conv, [512, 3, 1]], # 22 (P4/16-medium)
40
+
41
+ [-2, 1, Conv, [128, 1, 1]],
42
+ [-1, 1, nn.Upsample, [None, 2, 'nearest']],
43
+ [[-1, 6], 1, Concat, [1]], # cat backbone P3
44
+ [-1, 1, Bottleneck, [256, False]],
45
+ [-1, 2, Bottleneck, [256, False]], # 27 (P3/8-small)
46
+
47
+ [[27, 22, 15], 1, Detect, [nc]], # Detect(P3, P4, P5)
48
+ ]
modules/ultralytics/models/v5/yolov5-p6.yaml ADDED
@@ -0,0 +1,61 @@
1
+ # Ultralytics YOLO 🚀, AGPL-3.0 license
2
+ # YOLOv5 object detection model with P3-P6 outputs. For details see https://docs.ultralytics.com/models/yolov5
3
+
4
+ # Parameters
5
+ nc: 80 # number of classes
6
+ scales: # model compound scaling constants, i.e. 'model=yolov5n-p6.yaml' will call yolov5-p6.yaml with scale 'n'
7
+ # [depth, width, max_channels]
8
+ n: [0.33, 0.25, 1024]
9
+ s: [0.33, 0.50, 1024]
10
+ m: [0.67, 0.75, 1024]
11
+ l: [1.00, 1.00, 1024]
12
+ x: [1.33, 1.25, 1024]
13
+
14
+ # YOLOv5 v6.0 backbone
15
+ backbone:
16
+ # [from, number, module, args]
17
+ [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2
18
+ [-1, 1, Conv, [128, 3, 2]], # 1-P2/4
19
+ [-1, 3, C3, [128]],
20
+ [-1, 1, Conv, [256, 3, 2]], # 3-P3/8
21
+ [-1, 6, C3, [256]],
22
+ [-1, 1, Conv, [512, 3, 2]], # 5-P4/16
23
+ [-1, 9, C3, [512]],
24
+ [-1, 1, Conv, [768, 3, 2]], # 7-P5/32
25
+ [-1, 3, C3, [768]],
26
+ [-1, 1, Conv, [1024, 3, 2]], # 9-P6/64
27
+ [-1, 3, C3, [1024]],
28
+ [-1, 1, SPPF, [1024, 5]], # 11
29
+ ]
30
+
31
+ # YOLOv5 v6.0 head
32
+ head:
33
+ [[-1, 1, Conv, [768, 1, 1]],
34
+ [-1, 1, nn.Upsample, [None, 2, 'nearest']],
35
+ [[-1, 8], 1, Concat, [1]], # cat backbone P5
36
+ [-1, 3, C3, [768, False]], # 15
37
+
38
+ [-1, 1, Conv, [512, 1, 1]],
39
+ [-1, 1, nn.Upsample, [None, 2, 'nearest']],
40
+ [[-1, 6], 1, Concat, [1]], # cat backbone P4
41
+ [-1, 3, C3, [512, False]], # 19
42
+
43
+ [-1, 1, Conv, [256, 1, 1]],
44
+ [-1, 1, nn.Upsample, [None, 2, 'nearest']],
45
+ [[-1, 4], 1, Concat, [1]], # cat backbone P3
46
+ [-1, 3, C3, [256, False]], # 23 (P3/8-small)
47
+
48
+ [-1, 1, Conv, [256, 3, 2]],
49
+ [[-1, 20], 1, Concat, [1]], # cat head P4
50
+ [-1, 3, C3, [512, False]], # 26 (P4/16-medium)
51
+
52
+ [-1, 1, Conv, [512, 3, 2]],
53
+ [[-1, 16], 1, Concat, [1]], # cat head P5
54
+ [-1, 3, C3, [768, False]], # 29 (P5/32-large)
55
+
56
+ [-1, 1, Conv, [768, 3, 2]],
57
+ [[-1, 12], 1, Concat, [1]], # cat head P6
58
+ [-1, 3, C3, [1024, False]], # 32 (P6/64-xlarge)
59
+
60
+ [[23, 26, 29, 32], 1, Detect, [nc]], # Detect(P3, P4, P5, P6)
61
+ ]
modules/ultralytics/models/v5/yolov5.yaml ADDED
@@ -0,0 +1,50 @@
1
+ # Ultralytics YOLO 🚀, AGPL-3.0 license
2
+ # YOLOv5 object detection model with P3-P5 outputs. For details see https://docs.ultralytics.com/models/yolov5
3
+
4
+ # Parameters
5
+ nc: 80 # number of classes
6
+ scales: # model compound scaling constants, i.e. 'model=yolov5n.yaml' will call yolov5.yaml with scale 'n'
7
+ # [depth, width, max_channels]
8
+ n: [0.33, 0.25, 1024]
9
+ s: [0.33, 0.50, 1024]
10
+ m: [0.67, 0.75, 1024]
11
+ l: [1.00, 1.00, 1024]
12
+ x: [1.33, 1.25, 1024]
13
+
14
+ # YOLOv5 v6.0 backbone
15
+ backbone:
16
+ # [from, number, module, args]
17
+ [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2
18
+ [-1, 1, Conv, [128, 3, 2]], # 1-P2/4
19
+ [-1, 3, C3, [128]],
20
+ [-1, 1, Conv, [256, 3, 2]], # 3-P3/8
21
+ [-1, 6, C3, [256]],
22
+ [-1, 1, Conv, [512, 3, 2]], # 5-P4/16
23
+ [-1, 9, C3, [512]],
24
+ [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32
25
+ [-1, 3, C3, [1024]],
26
+ [-1, 1, SPPF, [1024, 5]], # 9
27
+ ]
28
+
29
+ # YOLOv5 v6.0 head
30
+ head:
31
+ [[-1, 1, Conv, [512, 1, 1]],
32
+ [-1, 1, nn.Upsample, [None, 2, 'nearest']],
33
+ [[-1, 6], 1, Concat, [1]], # cat backbone P4
34
+ [-1, 3, C3, [512, False]], # 13
35
+
36
+ [-1, 1, Conv, [256, 1, 1]],
37
+ [-1, 1, nn.Upsample, [None, 2, 'nearest']],
38
+ [[-1, 4], 1, Concat, [1]], # cat backbone P3
39
+ [-1, 3, C3, [256, False]], # 17 (P3/8-small)
40
+
41
+ [-1, 1, Conv, [256, 3, 2]],
42
+ [[-1, 14], 1, Concat, [1]], # cat head P4
43
+ [-1, 3, C3, [512, False]], # 20 (P4/16-medium)
44
+
45
+ [-1, 1, Conv, [512, 3, 2]],
46
+ [[-1, 10], 1, Concat, [1]], # cat head P5
47
+ [-1, 3, C3, [1024, False]], # 23 (P5/32-large)
48
+
49
+ [[17, 20, 23], 1, Detect, [nc]], # Detect(P3, P4, P5)
50
+ ]
modules/ultralytics/models/v6/yolov6.yaml ADDED
@@ -0,0 +1,53 @@
1
+ # Ultralytics YOLO 🚀, AGPL-3.0 license
2
+ # YOLOv6 object detection model with P3-P5 outputs. For Usage examples see https://docs.ultralytics.com/models/yolov6
3
+
4
+ # Parameters
5
+ nc: 80 # number of classes
6
+ activation: nn.ReLU() # (optional) model default activation function
7
+ scales: # model compound scaling constants, i.e. 'model=yolov6n.yaml' will call yolov8.yaml with scale 'n'
8
+ # [depth, width, max_channels]
9
+ n: [0.33, 0.25, 1024]
10
+ s: [0.33, 0.50, 1024]
11
+ m: [0.67, 0.75, 768]
12
+ l: [1.00, 1.00, 512]
13
+ x: [1.00, 1.25, 512]
14
+
15
+ # YOLOv6-3.0s backbone
16
+ backbone:
17
+ # [from, repeats, module, args]
18
+ - [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
19
+ - [-1, 1, Conv, [128, 3, 2]] # 1-P2/4
20
+ - [-1, 6, Conv, [128, 3, 1]]
21
+ - [-1, 1, Conv, [256, 3, 2]] # 3-P3/8
22
+ - [-1, 12, Conv, [256, 3, 1]]
23
+ - [-1, 1, Conv, [512, 3, 2]] # 5-P4/16
24
+ - [-1, 18, Conv, [512, 3, 1]]
25
+ - [-1, 1, Conv, [1024, 3, 2]] # 7-P5/32
26
+ - [-1, 6, Conv, [1024, 3, 1]]
27
+ - [-1, 1, SPPF, [1024, 5]] # 9
28
+
29
+ # YOLOv6-3.0s head
30
+ head:
31
+ - [-1, 1, Conv, [256, 1, 1]]
32
+ - [-1, 1, nn.ConvTranspose2d, [256, 2, 2, 0]]
33
+ - [[-1, 6], 1, Concat, [1]] # cat backbone P4
34
+ - [-1, 1, Conv, [256, 3, 1]]
35
+ - [-1, 9, Conv, [256, 3, 1]] # 14
36
+
37
+ - [-1, 1, Conv, [128, 1, 1]]
38
+ - [-1, 1, nn.ConvTranspose2d, [128, 2, 2, 0]]
39
+ - [[-1, 4], 1, Concat, [1]] # cat backbone P3
40
+ - [-1, 1, Conv, [128, 3, 1]]
41
+ - [-1, 9, Conv, [128, 3, 1]] # 19
42
+
43
+ - [-1, 1, Conv, [128, 3, 2]]
44
+ - [[-1, 15], 1, Concat, [1]] # cat head P4
45
+ - [-1, 1, Conv, [256, 3, 1]]
46
+ - [-1, 9, Conv, [256, 3, 1]] # 23
47
+
48
+ - [-1, 1, Conv, [256, 3, 2]]
49
+ - [[-1, 10], 1, Concat, [1]] # cat head P5
50
+ - [-1, 1, Conv, [512, 3, 1]]
51
+ - [-1, 9, Conv, [512, 3, 1]] # 27
52
+
53
+ - [[19, 23, 27], 1, Detect, [nc]] # Detect(P3, P4, P5)
modules/ultralytics/models/v8/yolov8-cls.yaml ADDED
@@ -0,0 +1,29 @@
1
+ # Ultralytics YOLO 🚀, AGPL-3.0 license
2
+ # YOLOv8-cls image classification model. For Usage examples see https://docs.ultralytics.com/tasks/classify
3
+
4
+ # Parameters
5
+ nc: 1000 # number of classes
6
+ scales: # model compound scaling constants, i.e. 'model=yolov8n-cls.yaml' will call yolov8-cls.yaml with scale 'n'
7
+ # [depth, width, max_channels]
8
+ n: [0.33, 0.25, 1024]
9
+ s: [0.33, 0.50, 1024]
10
+ m: [0.67, 0.75, 1024]
11
+ l: [1.00, 1.00, 1024]
12
+ x: [1.00, 1.25, 1024]
13
+
14
+ # YOLOv8.0n backbone
15
+ backbone:
16
+ # [from, repeats, module, args]
17
+ - [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
18
+ - [-1, 1, Conv, [128, 3, 2]] # 1-P2/4
19
+ - [-1, 3, C2f, [128, True]]
20
+ - [-1, 1, Conv, [256, 3, 2]] # 3-P3/8
21
+ - [-1, 6, C2f, [256, True]]
22
+ - [-1, 1, Conv, [512, 3, 2]] # 5-P4/16
23
+ - [-1, 6, C2f, [512, True]]
24
+ - [-1, 1, Conv, [1024, 3, 2]] # 7-P5/32
25
+ - [-1, 3, C2f, [1024, True]]
26
+
27
+ # YOLOv8.0n head
28
+ head:
29
+ - [-1, 1, Classify, [nc]] # Classify
modules/ultralytics/models/v8/yolov8-p2.yaml ADDED
@@ -0,0 +1,54 @@
1
+ # Ultralytics YOLO 🚀, AGPL-3.0 license
2
+ # YOLOv8 object detection model with P2-P5 outputs. For Usage examples see https://docs.ultralytics.com/tasks/detect
3
+
4
+ # Parameters
5
+ nc: 80 # number of classes
6
+ scales: # model compound scaling constants, i.e. 'model=yolov8n.yaml' will call yolov8.yaml with scale 'n'
7
+ # [depth, width, max_channels]
8
+ n: [0.33, 0.25, 1024]
9
+ s: [0.33, 0.50, 1024]
10
+ m: [0.67, 0.75, 768]
11
+ l: [1.00, 1.00, 512]
12
+ x: [1.00, 1.25, 512]
13
+
14
+ # YOLOv8.0 backbone
15
+ backbone:
16
+ # [from, repeats, module, args]
17
+ - [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
18
+ - [-1, 1, Conv, [128, 3, 2]] # 1-P2/4
19
+ - [-1, 3, C2f, [128, True]]
20
+ - [-1, 1, Conv, [256, 3, 2]] # 3-P3/8
21
+ - [-1, 6, C2f, [256, True]]
22
+ - [-1, 1, Conv, [512, 3, 2]] # 5-P4/16
23
+ - [-1, 6, C2f, [512, True]]
24
+ - [-1, 1, Conv, [1024, 3, 2]] # 7-P5/32
25
+ - [-1, 3, C2f, [1024, True]]
26
+ - [-1, 1, SPPF, [1024, 5]] # 9
27
+
28
+ # YOLOv8.0-p2 head
29
+ head:
30
+ - [-1, 1, nn.Upsample, [None, 2, 'nearest']]
31
+ - [[-1, 6], 1, Concat, [1]] # cat backbone P4
32
+ - [-1, 3, C2f, [512]] # 12
33
+
34
+ - [-1, 1, nn.Upsample, [None, 2, 'nearest']]
35
+ - [[-1, 4], 1, Concat, [1]] # cat backbone P3
36
+ - [-1, 3, C2f, [256]] # 15 (P3/8-small)
37
+
38
+ - [-1, 1, nn.Upsample, [None, 2, 'nearest']]
39
+ - [[-1, 2], 1, Concat, [1]] # cat backbone P2
40
+ - [-1, 3, C2f, [128]] # 18 (P2/4-xsmall)
41
+
42
+ - [-1, 1, Conv, [128, 3, 2]]
43
+ - [[-1, 15], 1, Concat, [1]] # cat head P3
44
+ - [-1, 3, C2f, [256]] # 21 (P3/8-small)
45
+
46
+ - [-1, 1, Conv, [256, 3, 2]]
47
+ - [[-1, 12], 1, Concat, [1]] # cat head P4
48
+ - [-1, 3, C2f, [512]] # 24 (P4/16-medium)
49
+
50
+ - [-1, 1, Conv, [512, 3, 2]]
51
+ - [[-1, 9], 1, Concat, [1]] # cat head P5
52
+ - [-1, 3, C2f, [1024]] # 27 (P5/32-large)
53
+
54
+ - [[18, 21, 24, 27], 1, Detect, [nc]] # Detect(P2, P3, P4, P5)
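As with the other configs in this directory, the `scales` block above means a scale letter can be appended to the file stem to select `[depth, width, max_channels]`. A hedged sketch using the standard Ultralytics API (config resolution is assumed to find the bundled YAML):

```python
# Hedged sketch: building a scaled variant of the P2 config above.
from ultralytics import YOLO

model = YOLO('yolov8n-p2.yaml')  # 'n' scale: depth 0.33, width 0.25, max_channels 1024
model.info()                     # print the layer and parameter summary
```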
modules/ultralytics/models/v8/yolov8-p6.yaml ADDED
@@ -0,0 +1,56 @@
1
+ # Ultralytics YOLO 🚀, AGPL-3.0 license
2
+ # YOLOv8 object detection model with P3-P6 outputs. For Usage examples see https://docs.ultralytics.com/tasks/detect
3
+
4
+ # Parameters
5
+ nc: 80 # number of classes
6
+ scales: # model compound scaling constants, i.e. 'model=yolov8n-p6.yaml' will call yolov8-p6.yaml with scale 'n'
7
+ # [depth, width, max_channels]
8
+ n: [0.33, 0.25, 1024]
9
+ s: [0.33, 0.50, 1024]
10
+ m: [0.67, 0.75, 768]
11
+ l: [1.00, 1.00, 512]
12
+ x: [1.00, 1.25, 512]
13
+
14
+ # YOLOv8.0x6 backbone
15
+ backbone:
16
+ # [from, repeats, module, args]
17
+ - [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
18
+ - [-1, 1, Conv, [128, 3, 2]] # 1-P2/4
19
+ - [-1, 3, C2f, [128, True]]
20
+ - [-1, 1, Conv, [256, 3, 2]] # 3-P3/8
21
+ - [-1, 6, C2f, [256, True]]
22
+ - [-1, 1, Conv, [512, 3, 2]] # 5-P4/16
23
+ - [-1, 6, C2f, [512, True]]
24
+ - [-1, 1, Conv, [768, 3, 2]] # 7-P5/32
25
+ - [-1, 3, C2f, [768, True]]
26
+ - [-1, 1, Conv, [1024, 3, 2]] # 9-P6/64
27
+ - [-1, 3, C2f, [1024, True]]
28
+ - [-1, 1, SPPF, [1024, 5]] # 11
29
+
30
+ # YOLOv8.0x6 head
31
+ head:
32
+ - [-1, 1, nn.Upsample, [None, 2, 'nearest']]
33
+ - [[-1, 8], 1, Concat, [1]] # cat backbone P5
34
+ - [-1, 3, C2, [768, False]] # 14
35
+
36
+ - [-1, 1, nn.Upsample, [None, 2, 'nearest']]
37
+ - [[-1, 6], 1, Concat, [1]] # cat backbone P4
38
+ - [-1, 3, C2, [512, False]] # 17
39
+
40
+ - [-1, 1, nn.Upsample, [None, 2, 'nearest']]
41
+ - [[-1, 4], 1, Concat, [1]] # cat backbone P3
42
+ - [-1, 3, C2, [256, False]] # 20 (P3/8-small)
43
+
44
+ - [-1, 1, Conv, [256, 3, 2]]
45
+ - [[-1, 17], 1, Concat, [1]] # cat head P4
46
+ - [-1, 3, C2, [512, False]] # 23 (P4/16-medium)
47
+
48
+ - [-1, 1, Conv, [512, 3, 2]]
49
+ - [[-1, 14], 1, Concat, [1]] # cat head P5
50
+ - [-1, 3, C2, [768, False]] # 26 (P5/32-large)
51
+
52
+ - [-1, 1, Conv, [768, 3, 2]]
53
+ - [[-1, 11], 1, Concat, [1]] # cat head P6
54
+ - [-1, 3, C2, [1024, False]] # 29 (P6/64-xlarge)
55
+
56
+ - [[20, 23, 26, 29], 1, Detect, [nc]] # Detect(P3, P4, P5, P6)
modules/ultralytics/models/v8/yolov8-pose-p6.yaml ADDED
@@ -0,0 +1,57 @@
1
+ # Ultralytics YOLO 🚀, AGPL-3.0 license
2
+ # YOLOv8-pose keypoints/pose estimation model. For Usage examples see https://docs.ultralytics.com/tasks/pose
3
+
4
+ # Parameters
5
+ nc: 1 # number of classes
6
+ kpt_shape: [17, 3] # number of keypoints, number of dims (2 for x,y or 3 for x,y,visible)
7
+ scales: # model compound scaling constants, i.e. 'model=yolov8n-p6.yaml' will call yolov8-p6.yaml with scale 'n'
8
+ # [depth, width, max_channels]
9
+ n: [0.33, 0.25, 1024]
10
+ s: [0.33, 0.50, 1024]
11
+ m: [0.67, 0.75, 768]
12
+ l: [1.00, 1.00, 512]
13
+ x: [1.00, 1.25, 512]
14
+
15
+ # YOLOv8.0x6 backbone
16
+ backbone:
17
+ # [from, repeats, module, args]
18
+ - [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
19
+ - [-1, 1, Conv, [128, 3, 2]] # 1-P2/4
20
+ - [-1, 3, C2f, [128, True]]
21
+ - [-1, 1, Conv, [256, 3, 2]] # 3-P3/8
22
+ - [-1, 6, C2f, [256, True]]
23
+ - [-1, 1, Conv, [512, 3, 2]] # 5-P4/16
24
+ - [-1, 6, C2f, [512, True]]
25
+ - [-1, 1, Conv, [768, 3, 2]] # 7-P5/32
26
+ - [-1, 3, C2f, [768, True]]
27
+ - [-1, 1, Conv, [1024, 3, 2]] # 9-P6/64
28
+ - [-1, 3, C2f, [1024, True]]
29
+ - [-1, 1, SPPF, [1024, 5]] # 11
30
+
31
+ # YOLOv8.0x6 head
32
+ head:
33
+ - [-1, 1, nn.Upsample, [None, 2, 'nearest']]
34
+ - [[-1, 8], 1, Concat, [1]] # cat backbone P5
35
+ - [-1, 3, C2, [768, False]] # 14
36
+
37
+ - [-1, 1, nn.Upsample, [None, 2, 'nearest']]
38
+ - [[-1, 6], 1, Concat, [1]] # cat backbone P4
39
+ - [-1, 3, C2, [512, False]] # 17
40
+
41
+ - [-1, 1, nn.Upsample, [None, 2, 'nearest']]
42
+ - [[-1, 4], 1, Concat, [1]] # cat backbone P3
43
+ - [-1, 3, C2, [256, False]] # 20 (P3/8-small)
44
+
45
+ - [-1, 1, Conv, [256, 3, 2]]
46
+ - [[-1, 17], 1, Concat, [1]] # cat head P4
47
+ - [-1, 3, C2, [512, False]] # 23 (P4/16-medium)
48
+
49
+ - [-1, 1, Conv, [512, 3, 2]]
50
+ - [[-1, 14], 1, Concat, [1]] # cat head P5
51
+ - [-1, 3, C2, [768, False]] # 26 (P5/32-large)
52
+
53
+ - [-1, 1, Conv, [768, 3, 2]]
54
+ - [[-1, 11], 1, Concat, [1]] # cat head P6
55
+ - [-1, 3, C2, [1024, False]] # 29 (P6/64-xlarge)
56
+
57
+ - [[20, 23, 26, 29], 1, Pose, [nc, kpt_shape]] # Pose(P3, P4, P5, P6)
modules/ultralytics/models/v8/yolov8-pose.yaml ADDED
@@ -0,0 +1,47 @@
1
+ # Ultralytics YOLO 🚀, AGPL-3.0 license
2
+ # YOLOv8-pose keypoints/pose estimation model. For Usage examples see https://docs.ultralytics.com/tasks/pose
3
+
4
+ # Parameters
5
+ nc: 1 # number of classes
6
+ kpt_shape: [17, 3] # number of keypoints, number of dims (2 for x,y or 3 for x,y,visible)
7
+ scales: # model compound scaling constants, i.e. 'model=yolov8n-pose.yaml' will call yolov8-pose.yaml with scale 'n'
8
+ # [depth, width, max_channels]
9
+ n: [0.33, 0.25, 1024]
10
+ s: [0.33, 0.50, 1024]
11
+ m: [0.67, 0.75, 768]
12
+ l: [1.00, 1.00, 512]
13
+ x: [1.00, 1.25, 512]
14
+
15
+ # YOLOv8.0n backbone
16
+ backbone:
17
+ # [from, repeats, module, args]
18
+ - [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
19
+ - [-1, 1, Conv, [128, 3, 2]] # 1-P2/4
20
+ - [-1, 3, C2f, [128, True]]
21
+ - [-1, 1, Conv, [256, 3, 2]] # 3-P3/8
22
+ - [-1, 6, C2f, [256, True]]
23
+ - [-1, 1, Conv, [512, 3, 2]] # 5-P4/16
24
+ - [-1, 6, C2f, [512, True]]
25
+ - [-1, 1, Conv, [1024, 3, 2]] # 7-P5/32
26
+ - [-1, 3, C2f, [1024, True]]
27
+ - [-1, 1, SPPF, [1024, 5]] # 9
28
+
29
+ # YOLOv8.0n head
30
+ head:
31
+ - [-1, 1, nn.Upsample, [None, 2, 'nearest']]
32
+ - [[-1, 6], 1, Concat, [1]] # cat backbone P4
33
+ - [-1, 3, C2f, [512]] # 12
34
+
35
+ - [-1, 1, nn.Upsample, [None, 2, 'nearest']]
36
+ - [[-1, 4], 1, Concat, [1]] # cat backbone P3
37
+ - [-1, 3, C2f, [256]] # 15 (P3/8-small)
38
+
39
+ - [-1, 1, Conv, [256, 3, 2]]
40
+ - [[-1, 12], 1, Concat, [1]] # cat head P4
41
+ - [-1, 3, C2f, [512]] # 18 (P4/16-medium)
42
+
43
+ - [-1, 1, Conv, [512, 3, 2]]
44
+ - [[-1, 9], 1, Concat, [1]] # cat head P5
45
+ - [-1, 3, C2f, [1024]] # 21 (P5/32-large)
46
+
47
+ - [[15, 18, 21], 1, Pose, [nc, kpt_shape]] # Pose(P3, P4, P5)
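
With kpt_shape: [17, 3], the Pose head regresses 17 * 3 = 51 keypoint values per anchor (x, y and a visibility score for each COCO keypoint) on top of the usual box and class outputs; switching to [17, 2] would drop the visibility channel. A quick illustrative check of that arithmetic:

kpt_shape = (17, 3)                 # from the YAML above: keypoints, dims per keypoint
nk = kpt_shape[0] * kpt_shape[1]    # keypoint channels predicted per anchor
print(nk)                           # 51
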
modules/ultralytics/models/v8/yolov8-rtdetr.yaml ADDED
@@ -0,0 +1,46 @@
1
+ # Ultralytics YOLO 🚀, AGPL-3.0 license
2
+ # YOLOv8 object detection model with P3-P5 outputs. For Usage examples see https://docs.ultralytics.com/tasks/detect
3
+
4
+ # Parameters
5
+ nc: 80 # number of classes
6
+ scales: # model compound scaling constants, i.e. 'model=yolov8n.yaml' will call yolov8.yaml with scale 'n'
7
+ # [depth, width, max_channels]
8
+ n: [0.33, 0.25, 1024] # YOLOv8n summary: 225 layers, 3157200 parameters, 3157184 gradients, 8.9 GFLOPs
9
+ s: [0.33, 0.50, 1024] # YOLOv8s summary: 225 layers, 11166560 parameters, 11166544 gradients, 28.8 GFLOPs
10
+ m: [0.67, 0.75, 768] # YOLOv8m summary: 295 layers, 25902640 parameters, 25902624 gradients, 79.3 GFLOPs
11
+ l: [1.00, 1.00, 512] # YOLOv8l summary: 365 layers, 43691520 parameters, 43691504 gradients, 165.7 GFLOPs
12
+ x: [1.00, 1.25, 512] # YOLOv8x summary: 365 layers, 68229648 parameters, 68229632 gradients, 258.5 GFLOPs
13
+
14
+ # YOLOv8.0n backbone
15
+ backbone:
16
+ # [from, repeats, module, args]
17
+ - [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
18
+ - [-1, 1, Conv, [128, 3, 2]] # 1-P2/4
19
+ - [-1, 3, C2f, [128, True]]
20
+ - [-1, 1, Conv, [256, 3, 2]] # 3-P3/8
21
+ - [-1, 6, C2f, [256, True]]
22
+ - [-1, 1, Conv, [512, 3, 2]] # 5-P4/16
23
+ - [-1, 6, C2f, [512, True]]
24
+ - [-1, 1, Conv, [1024, 3, 2]] # 7-P5/32
25
+ - [-1, 3, C2f, [1024, True]]
26
+ - [-1, 1, SPPF, [1024, 5]] # 9
27
+
28
+ # YOLOv8.0n head
29
+ head:
30
+ - [-1, 1, nn.Upsample, [None, 2, 'nearest']]
31
+ - [[-1, 6], 1, Concat, [1]] # cat backbone P4
32
+ - [-1, 3, C2f, [512]] # 12
33
+
34
+ - [-1, 1, nn.Upsample, [None, 2, 'nearest']]
35
+ - [[-1, 4], 1, Concat, [1]] # cat backbone P3
36
+ - [-1, 3, C2f, [256]] # 15 (P3/8-small)
37
+
38
+ - [-1, 1, Conv, [256, 3, 2]]
39
+ - [[-1, 12], 1, Concat, [1]] # cat head P4
40
+ - [-1, 3, C2f, [512]] # 18 (P4/16-medium)
41
+
42
+ - [-1, 1, Conv, [512, 3, 2]]
43
+ - [[-1, 9], 1, Concat, [1]] # cat head P5
44
+ - [-1, 3, C2f, [1024]] # 21 (P5/32-large)
45
+
46
+ - [[15, 18, 21], 1, RTDETRDecoder, [nc]] # Detect(P3, P4, P5)
modules/ultralytics/models/v8/yolov8-seg.yaml ADDED
@@ -0,0 +1,46 @@
1
+ # Ultralytics YOLO 🚀, AGPL-3.0 license
2
+ # YOLOv8-seg instance segmentation model. For Usage examples see https://docs.ultralytics.com/tasks/segment
3
+
4
+ # Parameters
5
+ nc: 80 # number of classes
6
+ scales: # model compound scaling constants, i.e. 'model=yolov8n-seg.yaml' will call yolov8-seg.yaml with scale 'n'
7
+ # [depth, width, max_channels]
8
+ n: [0.33, 0.25, 1024]
9
+ s: [0.33, 0.50, 1024]
10
+ m: [0.67, 0.75, 768]
11
+ l: [1.00, 1.00, 512]
12
+ x: [1.00, 1.25, 512]
13
+
14
+ # YOLOv8.0n backbone
15
+ backbone:
16
+ # [from, repeats, module, args]
17
+ - [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
18
+ - [-1, 1, Conv, [128, 3, 2]] # 1-P2/4
19
+ - [-1, 3, C2f, [128, True]]
20
+ - [-1, 1, Conv, [256, 3, 2]] # 3-P3/8
21
+ - [-1, 6, C2f, [256, True]]
22
+ - [-1, 1, Conv, [512, 3, 2]] # 5-P4/16
23
+ - [-1, 6, C2f, [512, True]]
24
+ - [-1, 1, Conv, [1024, 3, 2]] # 7-P5/32
25
+ - [-1, 3, C2f, [1024, True]]
26
+ - [-1, 1, SPPF, [1024, 5]] # 9
27
+
28
+ # YOLOv8.0n head
29
+ head:
30
+ - [-1, 1, nn.Upsample, [None, 2, 'nearest']]
31
+ - [[-1, 6], 1, Concat, [1]] # cat backbone P4
32
+ - [-1, 3, C2f, [512]] # 12
33
+
34
+ - [-1, 1, nn.Upsample, [None, 2, 'nearest']]
35
+ - [[-1, 4], 1, Concat, [1]] # cat backbone P3
36
+ - [-1, 3, C2f, [256]] # 15 (P3/8-small)
37
+
38
+ - [-1, 1, Conv, [256, 3, 2]]
39
+ - [[-1, 12], 1, Concat, [1]] # cat head P4
40
+ - [-1, 3, C2f, [512]] # 18 (P4/16-medium)
41
+
42
+ - [-1, 1, Conv, [512, 3, 2]]
43
+ - [[-1, 9], 1, Concat, [1]] # cat head P5
44
+ - [-1, 3, C2f, [1024]] # 21 (P5/32-large)
45
+
46
+ - [[15, 18, 21], 1, Segment, [nc, 32, 256]] # Segment(P3, P4, P5)
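
The Segment head arguments [nc, 32, 256] set 32 mask coefficients per detection and 256 prototype channels (this reading follows the usual upstream Segment signature and is an assumption here, since head.py is not shown in this view). The prototype masks themselves come from the Proto module added in modules/ultralytics/nn/modules/block.py below; an illustrative shape check:

import torch
from ultralytics.nn.modules import Proto

proto = Proto(c1=256, c_=256, c2=32)        # 256-channel input -> 32 mask prototypes
masks = proto(torch.randn(1, 256, 80, 80))  # e.g. a P3 feature map for a 640x640 input
print(masks.shape)                          # torch.Size([1, 32, 160, 160]) after the 2x upsample
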
modules/ultralytics/models/v8/yolov8.yaml ADDED
@@ -0,0 +1,46 @@
1
+ # Ultralytics YOLO 🚀, AGPL-3.0 license
2
+ # YOLOv8 object detection model with P3-P5 outputs. For Usage examples see https://docs.ultralytics.com/tasks/detect
3
+
4
+ # Parameters
5
+ nc: 80 # number of classes
6
+ scales: # model compound scaling constants, i.e. 'model=yolov8n.yaml' will call yolov8.yaml with scale 'n'
7
+ # [depth, width, max_channels]
8
+ n: [0.33, 0.25, 1024] # YOLOv8n summary: 225 layers, 3157200 parameters, 3157184 gradients, 8.9 GFLOPs
9
+ s: [0.33, 0.50, 1024] # YOLOv8s summary: 225 layers, 11166560 parameters, 11166544 gradients, 28.8 GFLOPs
10
+ m: [0.67, 0.75, 768] # YOLOv8m summary: 295 layers, 25902640 parameters, 25902624 gradients, 79.3 GFLOPs
11
+ l: [1.00, 1.00, 512] # YOLOv8l summary: 365 layers, 43691520 parameters, 43691504 gradients, 165.7 GFLOPs
12
+ x: [1.00, 1.25, 512] # YOLOv8x summary: 365 layers, 68229648 parameters, 68229632 gradients, 258.5 GFLOPs
13
+
14
+ # YOLOv8.0n backbone
15
+ backbone:
16
+ # [from, repeats, module, args]
17
+ - [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
18
+ - [-1, 1, Conv, [128, 3, 2]] # 1-P2/4
19
+ - [-1, 3, C2f, [128, True]]
20
+ - [-1, 1, Conv, [256, 3, 2]] # 3-P3/8
21
+ - [-1, 6, C2f, [256, True]]
22
+ - [-1, 1, Conv, [512, 3, 2]] # 5-P4/16
23
+ - [-1, 6, C2f, [512, True]]
24
+ - [-1, 1, Conv, [1024, 3, 2]] # 7-P5/32
25
+ - [-1, 3, C2f, [1024, True]]
26
+ - [-1, 1, SPPF, [1024, 5]] # 9
27
+
28
+ # YOLOv8.0n head
29
+ head:
30
+ - [-1, 1, nn.Upsample, [None, 2, 'nearest']]
31
+ - [[-1, 6], 1, Concat, [1]] # cat backbone P4
32
+ - [-1, 3, C2f, [512]] # 12
33
+
34
+ - [-1, 1, nn.Upsample, [None, 2, 'nearest']]
35
+ - [[-1, 4], 1, Concat, [1]] # cat backbone P3
36
+ - [-1, 3, C2f, [256]] # 15 (P3/8-small)
37
+
38
+ - [-1, 1, Conv, [256, 3, 2]]
39
+ - [[-1, 12], 1, Concat, [1]] # cat head P4
40
+ - [-1, 3, C2f, [512]] # 18 (P4/16-medium)
41
+
42
+ - [-1, 1, Conv, [512, 3, 2]]
43
+ - [[-1, 9], 1, Concat, [1]] # cat head P5
44
+ - [-1, 3, C2f, [1024]] # 21 (P5/32-large)
45
+
46
+ - [[15, 18, 21], 1, Detect, [nc]] # Detect(P3, P4, P5)
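
The scales table maps each model size to [depth, width, max_channels]: when the YAML is parsed, per-layer repeats are multiplied by the depth gain and channel counts by the width gain, capped at max_channels and kept divisible by 8. A simplified sketch of that arithmetic (the real logic lives in parse_model in nn/tasks.py and also depends on per-module argument positions, so this is illustrative only):

import math

def scale_layer(repeats, channels, depth, width, max_channels):
    # simplified compound scaling as applied per layer when parsing the YAML
    r = max(round(repeats * depth), 1) if repeats > 1 else repeats  # depth gain on repeats
    c = math.ceil(min(channels, max_channels) * width / 8) * 8      # width gain, capped and 8-divisible
    return r, c

print(scale_layer(6, 512, 0.33, 0.25, 1024))  # 'n' scale: (2, 128) -- a 6-repeat, 512-ch C2f becomes 2 repeats, 128 ch
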
modules/ultralytics/nn/__init__.py ADDED
@@ -0,0 +1,9 @@
1
+ # Ultralytics YOLO 🚀, AGPL-3.0 license
2
+
3
+ from .tasks import (BaseModel, ClassificationModel, DetectionModel, SegmentationModel, attempt_load_one_weight,
4
+ attempt_load_weights, guess_model_scale, guess_model_task, parse_model, torch_safe_load,
5
+ yaml_model_load)
6
+
7
+ __all__ = ('attempt_load_one_weight', 'attempt_load_weights', 'parse_model', 'yaml_model_load', 'guess_model_task',
8
+ 'guess_model_scale', 'torch_safe_load', 'DetectionModel', 'SegmentationModel', 'ClassificationModel',
9
+ 'BaseModel')
modules/ultralytics/nn/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (576 Bytes).
 
modules/ultralytics/nn/__pycache__/autobackend.cpython-312.pyc ADDED
Binary file (31.8 kB).
 
modules/ultralytics/nn/__pycache__/tasks.cpython-312.pyc ADDED
Binary file (49.3 kB).
 
modules/ultralytics/nn/autobackend.py ADDED
@@ -0,0 +1,455 @@
1
+ # Ultralytics YOLO 🚀, AGPL-3.0 license
2
+
3
+ import ast
4
+ import contextlib
5
+ import json
6
+ import platform
7
+ import zipfile
8
+ from collections import OrderedDict, namedtuple
9
+ from pathlib import Path
10
+ from urllib.parse import urlparse
11
+
12
+ import cv2
13
+ import numpy as np
14
+ import torch
15
+ import torch.nn as nn
16
+ from PIL import Image
17
+
18
+ from ultralytics.yolo.utils import LINUX, LOGGER, ROOT, yaml_load
19
+ from ultralytics.yolo.utils.checks import check_requirements, check_suffix, check_version, check_yaml
20
+ from ultralytics.yolo.utils.downloads import attempt_download_asset, is_url
21
+ from ultralytics.yolo.utils.ops import xywh2xyxy
22
+
23
+
24
+ def check_class_names(names):
25
+ """Check class names. Map imagenet class codes to human-readable names if required. Convert lists to dicts."""
26
+ if isinstance(names, list): # names is a list
27
+ names = dict(enumerate(names)) # convert to dict
28
+ if isinstance(names, dict):
29
+ # Convert 1) string keys to int, i.e. '0' to 0, and non-string values to strings, i.e. True to 'True'
30
+ names = {int(k): str(v) for k, v in names.items()}
31
+ n = len(names)
32
+ if max(names.keys()) >= n:
33
+ raise KeyError(f'{n}-class dataset requires class indices 0-{n - 1}, but you have invalid class indices '
34
+ f'{min(names.keys())}-{max(names.keys())} defined in your dataset YAML.')
35
+ if isinstance(names[0], str) and names[0].startswith('n0'): # imagenet class codes, i.e. 'n01440764'
36
+ map = yaml_load(ROOT / 'datasets/ImageNet.yaml')['map'] # human-readable names
37
+ names = {k: map[v] for k, v in names.items()}
38
+ return names
39
+
40
+
41
+ class AutoBackend(nn.Module):
42
+
43
+ def __init__(self,
44
+ weights='yolov8n.pt',
45
+ device=torch.device('cpu'),
46
+ dnn=False,
47
+ data=None,
48
+ fp16=False,
49
+ fuse=True,
50
+ verbose=True):
51
+ """
52
+ MultiBackend class for python inference on various platforms using Ultralytics YOLO.
53
+
54
+ Args:
55
+ weights (str): The path to the weights file. Default: 'yolov8n.pt'
56
+ device (torch.device): The device to run the model on.
57
+ dnn (bool): Use OpenCV DNN module for inference if True, defaults to False.
58
+ data (str | Path | optional): Additional data.yaml file for class names.
59
+ fp16 (bool): If True, use half precision. Default: False
60
+ fuse (bool): Whether to fuse the model or not. Default: True
61
+ verbose (bool): Whether to run in verbose mode or not. Default: True
62
+
63
+ Supported formats and their naming conventions:
64
+ | Format | Suffix |
65
+ |-----------------------|------------------|
66
+ | PyTorch | *.pt |
67
+ | TorchScript | *.torchscript |
68
+ | ONNX Runtime | *.onnx |
69
+ | ONNX OpenCV DNN | *.onnx dnn=True |
70
+ | OpenVINO | *.xml |
71
+ | CoreML | *.mlmodel |
72
+ | TensorRT | *.engine |
73
+ | TensorFlow SavedModel | *_saved_model |
74
+ | TensorFlow GraphDef | *.pb |
75
+ | TensorFlow Lite | *.tflite |
76
+ | TensorFlow Edge TPU | *_edgetpu.tflite |
77
+ | PaddlePaddle | *_paddle_model |
78
+ """
79
+ super().__init__()
80
+ w = str(weights[0] if isinstance(weights, list) else weights)
81
+ nn_module = isinstance(weights, torch.nn.Module)
82
+ pt, jit, onnx, xml, engine, coreml, saved_model, pb, tflite, edgetpu, tfjs, paddle, triton = self._model_type(w)
83
+ fp16 &= pt or jit or onnx or engine or nn_module or triton # FP16
84
+ nhwc = coreml or saved_model or pb or tflite or edgetpu # BHWC formats (vs torch BCHW)
85
+ stride = 32 # default stride
86
+ model, metadata = None, None
87
+ cuda = torch.cuda.is_available() and device.type != 'cpu' # use CUDA
88
+ if not (pt or triton or nn_module):
89
+ w = attempt_download_asset(w) # download if not local
90
+
91
+ # NOTE: special case: in-memory pytorch model
92
+ if nn_module:
93
+ model = weights.to(device)
94
+ model = model.fuse(verbose=verbose) if fuse else model
95
+ if hasattr(model, 'kpt_shape'):
96
+ kpt_shape = model.kpt_shape # pose-only
97
+ stride = max(int(model.stride.max()), 32) # model stride
98
+ names = model.module.names if hasattr(model, 'module') else model.names # get class names
99
+ model.half() if fp16 else model.float()
100
+ self.model = model # explicitly assign for to(), cpu(), cuda(), half()
101
+ pt = True
102
+ elif pt: # PyTorch
103
+ from ultralytics.nn.tasks import attempt_load_weights
104
+ model = attempt_load_weights(weights if isinstance(weights, list) else w,
105
+ device=device,
106
+ inplace=True,
107
+ fuse=fuse)
108
+ if hasattr(model, 'kpt_shape'):
109
+ kpt_shape = model.kpt_shape # pose-only
110
+ stride = max(int(model.stride.max()), 32) # model stride
111
+ names = model.module.names if hasattr(model, 'module') else model.names # get class names
112
+ model.half() if fp16 else model.float()
113
+ self.model = model # explicitly assign for to(), cpu(), cuda(), half()
114
+ elif jit: # TorchScript
115
+ LOGGER.info(f'Loading {w} for TorchScript inference...')
116
+ extra_files = {'config.txt': ''} # model metadata
117
+ model = torch.jit.load(w, _extra_files=extra_files, map_location=device)
118
+ model.half() if fp16 else model.float()
119
+ if extra_files['config.txt']: # load metadata dict
120
+ metadata = json.loads(extra_files['config.txt'], object_hook=lambda x: dict(x.items()))
121
+ elif dnn: # ONNX OpenCV DNN
122
+ LOGGER.info(f'Loading {w} for ONNX OpenCV DNN inference...')
123
+ check_requirements('opencv-python>=4.5.4')
124
+ net = cv2.dnn.readNetFromONNX(w)
125
+ elif onnx: # ONNX Runtime
126
+ LOGGER.info(f'Loading {w} for ONNX Runtime inference...')
127
+ check_requirements(('onnx', 'onnxruntime-gpu' if cuda else 'onnxruntime'))
128
+ import onnxruntime
129
+ providers = ['CUDAExecutionProvider', 'CPUExecutionProvider'] if cuda else ['CPUExecutionProvider']
130
+ session = onnxruntime.InferenceSession(w, providers=providers)
131
+ output_names = [x.name for x in session.get_outputs()]
132
+ metadata = session.get_modelmeta().custom_metadata_map # metadata
133
+ elif xml: # OpenVINO
134
+ LOGGER.info(f'Loading {w} for OpenVINO inference...')
135
+ check_requirements('openvino') # requires openvino-dev: https://pypi.org/project/openvino-dev/
136
+ from openvino.runtime import Core, Layout, get_batch # noqa
137
+ ie = Core()
138
+ w = Path(w)
139
+ if not w.is_file(): # if not *.xml
140
+ w = next(w.glob('*.xml')) # get *.xml file from *_openvino_model dir
141
+ network = ie.read_model(model=str(w), weights=w.with_suffix('.bin'))
142
+ if network.get_parameters()[0].get_layout().empty:
143
+ network.get_parameters()[0].set_layout(Layout('NCHW'))
144
+ batch_dim = get_batch(network)
145
+ if batch_dim.is_static:
146
+ batch_size = batch_dim.get_length()
147
+ executable_network = ie.compile_model(network, device_name='CPU') # device_name="MYRIAD" for NCS2
148
+ metadata = w.parent / 'metadata.yaml'
149
+ elif engine: # TensorRT
150
+ LOGGER.info(f'Loading {w} for TensorRT inference...')
151
+ try:
152
+ import tensorrt as trt # noqa https://developer.nvidia.com/nvidia-tensorrt-download
153
+ except ImportError:
154
+ if LINUX:
155
+ check_requirements('nvidia-tensorrt', cmds='-U --index-url https://pypi.ngc.nvidia.com')
156
+ import tensorrt as trt # noqa
157
+ check_version(trt.__version__, '7.0.0', hard=True) # require tensorrt>=7.0.0
158
+ if device.type == 'cpu':
159
+ device = torch.device('cuda:0')
160
+ Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr'))
161
+ logger = trt.Logger(trt.Logger.INFO)
162
+ # Read file
163
+ with open(w, 'rb') as f, trt.Runtime(logger) as runtime:
164
+ meta_len = int.from_bytes(f.read(4), byteorder='little') # read metadata length
165
+ metadata = json.loads(f.read(meta_len).decode('utf-8')) # read metadata
166
+ model = runtime.deserialize_cuda_engine(f.read()) # read engine
167
+ context = model.create_execution_context()
168
+ bindings = OrderedDict()
169
+ output_names = []
170
+ fp16 = False # default updated below
171
+ dynamic = False
172
+ for i in range(model.num_bindings):
173
+ name = model.get_binding_name(i)
174
+ dtype = trt.nptype(model.get_binding_dtype(i))
175
+ if model.binding_is_input(i):
176
+ if -1 in tuple(model.get_binding_shape(i)): # dynamic
177
+ dynamic = True
178
+ context.set_binding_shape(i, tuple(model.get_profile_shape(0, i)[2]))
179
+ if dtype == np.float16:
180
+ fp16 = True
181
+ else: # output
182
+ output_names.append(name)
183
+ shape = tuple(context.get_binding_shape(i))
184
+ im = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device)
185
+ bindings[name] = Binding(name, dtype, shape, im, int(im.data_ptr()))
186
+ binding_addrs = OrderedDict((n, d.ptr) for n, d in bindings.items())
187
+ batch_size = bindings['images'].shape[0] # if dynamic, this is instead max batch size
188
+ elif coreml: # CoreML
189
+ LOGGER.info(f'Loading {w} for CoreML inference...')
190
+ import coremltools as ct
191
+ model = ct.models.MLModel(w)
192
+ metadata = dict(model.user_defined_metadata)
193
+ elif saved_model: # TF SavedModel
194
+ LOGGER.info(f'Loading {w} for TensorFlow SavedModel inference...')
195
+ import tensorflow as tf
196
+ keras = False # assume TF1 saved_model
197
+ model = tf.keras.models.load_model(w) if keras else tf.saved_model.load(w)
198
+ metadata = Path(w) / 'metadata.yaml'
199
+ elif pb: # GraphDef https://www.tensorflow.org/guide/migrate#a_graphpb_or_graphpbtxt
200
+ LOGGER.info(f'Loading {w} for TensorFlow GraphDef inference...')
201
+ import tensorflow as tf
202
+
203
+ from ultralytics.yolo.engine.exporter import gd_outputs
204
+
205
+ def wrap_frozen_graph(gd, inputs, outputs):
206
+ """Wrap frozen graphs for deployment."""
207
+ x = tf.compat.v1.wrap_function(lambda: tf.compat.v1.import_graph_def(gd, name=''), []) # wrapped
208
+ ge = x.graph.as_graph_element
209
+ return x.prune(tf.nest.map_structure(ge, inputs), tf.nest.map_structure(ge, outputs))
210
+
211
+ gd = tf.Graph().as_graph_def() # TF GraphDef
212
+ with open(w, 'rb') as f:
213
+ gd.ParseFromString(f.read())
214
+ frozen_func = wrap_frozen_graph(gd, inputs='x:0', outputs=gd_outputs(gd))
215
+ elif tflite or edgetpu: # https://www.tensorflow.org/lite/guide/python#install_tensorflow_lite_for_python
216
+ try: # https://coral.ai/docs/edgetpu/tflite-python/#update-existing-tf-lite-code-for-the-edge-tpu
217
+ from tflite_runtime.interpreter import Interpreter, load_delegate
218
+ except ImportError:
219
+ import tensorflow as tf
220
+ Interpreter, load_delegate = tf.lite.Interpreter, tf.lite.experimental.load_delegate
221
+ if edgetpu: # TF Edge TPU https://coral.ai/software/#edgetpu-runtime
222
+ LOGGER.info(f'Loading {w} for TensorFlow Lite Edge TPU inference...')
223
+ delegate = {
224
+ 'Linux': 'libedgetpu.so.1',
225
+ 'Darwin': 'libedgetpu.1.dylib',
226
+ 'Windows': 'edgetpu.dll'}[platform.system()]
227
+ interpreter = Interpreter(model_path=w, experimental_delegates=[load_delegate(delegate)])
228
+ else: # TFLite
229
+ LOGGER.info(f'Loading {w} for TensorFlow Lite inference...')
230
+ interpreter = Interpreter(model_path=w) # load TFLite model
231
+ interpreter.allocate_tensors() # allocate
232
+ input_details = interpreter.get_input_details() # inputs
233
+ output_details = interpreter.get_output_details() # outputs
234
+ # Load metadata
235
+ with contextlib.suppress(zipfile.BadZipFile):
236
+ with zipfile.ZipFile(w, 'r') as model:
237
+ meta_file = model.namelist()[0]
238
+ metadata = ast.literal_eval(model.read(meta_file).decode('utf-8'))
239
+ elif tfjs: # TF.js
240
+ raise NotImplementedError('YOLOv8 TF.js inference is not supported')
241
+ elif paddle: # PaddlePaddle
242
+ LOGGER.info(f'Loading {w} for PaddlePaddle inference...')
243
+ check_requirements('paddlepaddle-gpu' if cuda else 'paddlepaddle')
244
+ import paddle.inference as pdi # noqa
245
+ w = Path(w)
246
+ if not w.is_file(): # if not *.pdmodel
247
+ w = next(w.rglob('*.pdmodel')) # get *.pdmodel file from *_paddle_model dir
248
+ config = pdi.Config(str(w), str(w.with_suffix('.pdiparams')))
249
+ if cuda:
250
+ config.enable_use_gpu(memory_pool_init_size_mb=2048, device_id=0)
251
+ predictor = pdi.create_predictor(config)
252
+ input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
253
+ output_names = predictor.get_output_names()
254
+ metadata = w.parents[1] / 'metadata.yaml'
255
+ elif triton: # NVIDIA Triton Inference Server
256
+ LOGGER.info('Triton Inference Server not supported...')
257
+ '''
258
+ TODO:
259
+ check_requirements('tritonclient[all]')
260
+ from utils.triton import TritonRemoteModel
261
+ model = TritonRemoteModel(url=w)
262
+ nhwc = model.runtime.startswith("tensorflow")
263
+ '''
264
+ else:
265
+ from ultralytics.yolo.engine.exporter import export_formats
266
+ raise TypeError(f"model='{w}' is not a supported model format. "
267
+ 'See https://docs.ultralytics.com/modes/predict for help.'
268
+ f'\n\n{export_formats()}')
269
+
270
+ # Load external metadata YAML
271
+ if isinstance(metadata, (str, Path)) and Path(metadata).exists():
272
+ metadata = yaml_load(metadata)
273
+ if metadata:
274
+ for k, v in metadata.items():
275
+ if k in ('stride', 'batch'):
276
+ metadata[k] = int(v)
277
+ elif k in ('imgsz', 'names', 'kpt_shape') and isinstance(v, str):
278
+ metadata[k] = eval(v)
279
+ stride = metadata['stride']
280
+ task = metadata['task']
281
+ batch = metadata['batch']
282
+ imgsz = metadata['imgsz']
283
+ names = metadata['names']
284
+ kpt_shape = metadata.get('kpt_shape')
285
+ elif not (pt or triton or nn_module):
286
+ LOGGER.warning(f"WARNING ⚠️ Metadata not found for 'model={weights}'")
287
+
288
+ # Check names
289
+ if 'names' not in locals(): # names missing
290
+ names = self._apply_default_class_names(data)
291
+ names = check_class_names(names)
292
+
293
+ self.__dict__.update(locals()) # assign all variables to self
294
+
295
+ def forward(self, im, augment=False, visualize=False):
296
+ """
297
+ Runs inference on the YOLOv8 MultiBackend model.
298
+
299
+ Args:
300
+ im (torch.Tensor): The image tensor to perform inference on.
301
+ augment (bool): whether to perform data augmentation during inference, defaults to False
302
+ visualize (bool): whether to visualize the output predictions, defaults to False
303
+
304
+ Returns:
305
+ (tuple): Tuple containing the raw output tensor, and processed output for visualization (if visualize=True)
306
+ """
307
+ b, ch, h, w = im.shape # batch, channel, height, width
308
+ if self.fp16 and im.dtype != torch.float16:
309
+ im = im.half() # to FP16
310
+ if self.nhwc:
311
+ im = im.permute(0, 2, 3, 1) # torch BCHW to numpy BHWC shape(1,320,192,3)
312
+
313
+ if self.pt or self.nn_module: # PyTorch
314
+ y = self.model(im, augment=augment, visualize=visualize) if augment or visualize else self.model(im)
315
+ elif self.jit: # TorchScript
316
+ y = self.model(im)
317
+ elif self.dnn: # ONNX OpenCV DNN
318
+ im = im.cpu().numpy() # torch to numpy
319
+ self.net.setInput(im)
320
+ y = self.net.forward()
321
+ elif self.onnx: # ONNX Runtime
322
+ im = im.cpu().numpy() # torch to numpy
323
+ y = self.session.run(self.output_names, {self.session.get_inputs()[0].name: im})
324
+ elif self.xml: # OpenVINO
325
+ im = im.cpu().numpy() # FP32
326
+ y = list(self.executable_network([im]).values())
327
+ elif self.engine: # TensorRT
328
+ if self.dynamic and im.shape != self.bindings['images'].shape:
329
+ i = self.model.get_binding_index('images')
330
+ self.context.set_binding_shape(i, im.shape) # reshape if dynamic
331
+ self.bindings['images'] = self.bindings['images']._replace(shape=im.shape)
332
+ for name in self.output_names:
333
+ i = self.model.get_binding_index(name)
334
+ self.bindings[name].data.resize_(tuple(self.context.get_binding_shape(i)))
335
+ s = self.bindings['images'].shape
336
+ assert im.shape == s, f"input size {im.shape} {'>' if self.dynamic else 'not equal to'} max model size {s}"
337
+ self.binding_addrs['images'] = int(im.data_ptr())
338
+ self.context.execute_v2(list(self.binding_addrs.values()))
339
+ y = [self.bindings[x].data for x in sorted(self.output_names)]
340
+ elif self.coreml: # CoreML
341
+ im = im[0].cpu().numpy()
342
+ im_pil = Image.fromarray((im * 255).astype('uint8'))
343
+ # im = im.resize((192, 320), Image.ANTIALIAS)
344
+ y = self.model.predict({'image': im_pil}) # coordinates are xywh normalized
345
+ if 'confidence' in y:
346
+ box = xywh2xyxy(y['coordinates'] * [[w, h, w, h]]) # xyxy pixels
347
+ conf, cls = y['confidence'].max(1), y['confidence'].argmax(1).astype(float)  # np.float was removed in NumPy >= 1.24
348
+ y = np.concatenate((box, conf.reshape(-1, 1), cls.reshape(-1, 1)), 1)
349
+ elif len(y) == 1: # classification model
350
+ y = list(y.values())
351
+ elif len(y) == 2: # segmentation model
352
+ y = list(reversed(y.values())) # reversed for segmentation models (pred, proto)
353
+ elif self.paddle: # PaddlePaddle
354
+ im = im.cpu().numpy().astype(np.float32)
355
+ self.input_handle.copy_from_cpu(im)
356
+ self.predictor.run()
357
+ y = [self.predictor.get_output_handle(x).copy_to_cpu() for x in self.output_names]
358
+ elif self.triton: # NVIDIA Triton Inference Server
359
+ y = self.model(im)
360
+ else: # TensorFlow (SavedModel, GraphDef, Lite, Edge TPU)
361
+ im = im.cpu().numpy()
362
+ if self.saved_model: # SavedModel
363
+ y = self.model(im, training=False) if self.keras else self.model(im)
364
+ if not isinstance(y, list):
365
+ y = [y]
366
+ elif self.pb: # GraphDef
367
+ y = self.frozen_func(x=self.tf.constant(im))
368
+ if len(y) == 2 and len(self.names) == 999: # segments and names not defined
369
+ ip, ib = (0, 1) if len(y[0].shape) == 4 else (1, 0) # index of protos, boxes
370
+ nc = y[ib].shape[1] - y[ip].shape[3] - 4 # y = (1, 160, 160, 32), (1, 116, 8400)
371
+ self.names = {i: f'class{i}' for i in range(nc)}
372
+ else: # Lite or Edge TPU
373
+ input = self.input_details[0]
374
+ int8 = input['dtype'] == np.int8 # is TFLite quantized int8 model
375
+ if int8:
376
+ scale, zero_point = input['quantization']
377
+ im = (im / scale + zero_point).astype(np.int8) # de-scale
378
+ self.interpreter.set_tensor(input['index'], im)
379
+ self.interpreter.invoke()
380
+ y = []
381
+ for output in self.output_details:
382
+ x = self.interpreter.get_tensor(output['index'])
383
+ if int8:
384
+ scale, zero_point = output['quantization']
385
+ x = (x.astype(np.float32) - zero_point) * scale # re-scale
386
+ y.append(x)
387
+ # TF segment fixes: export is reversed vs ONNX export and protos are transposed
388
+ if len(y) == 2: # segment with (det, proto) output order reversed
389
+ if len(y[1].shape) != 4:
390
+ y = list(reversed(y)) # should be y = (1, 116, 8400), (1, 160, 160, 32)
391
+ y[1] = np.transpose(y[1], (0, 3, 1, 2)) # should be y = (1, 116, 8400), (1, 32, 160, 160)
392
+ y = [x if isinstance(x, np.ndarray) else x.numpy() for x in y]
393
+ # y[0][..., :4] *= [w, h, w, h] # xywh normalized to pixels
394
+
395
+ # for x in y:
396
+ # print(type(x), len(x)) if isinstance(x, (list, tuple)) else print(type(x), x.shape) # debug shapes
397
+ if isinstance(y, (list, tuple)):
398
+ return self.from_numpy(y[0]) if len(y) == 1 else [self.from_numpy(x) for x in y]
399
+ else:
400
+ return self.from_numpy(y)
401
+
402
+ def from_numpy(self, x):
403
+ """
404
+ Convert a numpy array to a tensor.
405
+
406
+ Args:
407
+ x (np.ndarray): The array to be converted.
408
+
409
+ Returns:
410
+ (torch.Tensor): The converted tensor
411
+ """
412
+ return torch.tensor(x).to(self.device) if isinstance(x, np.ndarray) else x
413
+
414
+ def warmup(self, imgsz=(1, 3, 640, 640)):
415
+ """
416
+ Warm up the model by running one forward pass with a dummy input.
417
+
418
+ Args:
419
+ imgsz (tuple): The shape of the dummy input tensor in the format (batch_size, channels, height, width)
420
+
421
+ Returns:
422
+ (None): This method runs the forward pass and does not return any value
423
+ """
424
+ warmup_types = self.pt, self.jit, self.onnx, self.engine, self.saved_model, self.pb, self.triton, self.nn_module
425
+ if any(warmup_types) and (self.device.type != 'cpu' or self.triton):
426
+ im = torch.empty(*imgsz, dtype=torch.half if self.fp16 else torch.float, device=self.device) # input
427
+ for _ in range(2 if self.jit else 1): #
428
+ self.forward(im) # warmup
429
+
430
+ @staticmethod
431
+ def _apply_default_class_names(data):
432
+ """Applies default class names to an input YAML file or returns numerical class names."""
433
+ with contextlib.suppress(Exception):
434
+ return yaml_load(check_yaml(data))['names']
435
+ return {i: f'class{i}' for i in range(999)} # return default if above errors
436
+
437
+ @staticmethod
438
+ def _model_type(p='path/to/model.pt'):
439
+ """
440
+ This function takes a path to a model file and returns the model type
441
+
442
+ Args:
443
+ p: path to the model file. Defaults to path/to/model.pt
444
+ """
445
+ # Return model type from model path, i.e. path='path/to/model.onnx' -> type=onnx
446
+ # types = [pt, jit, onnx, xml, engine, coreml, saved_model, pb, tflite, edgetpu, tfjs, paddle]
447
+ from ultralytics.yolo.engine.exporter import export_formats
448
+ sf = list(export_formats().Suffix) # export suffixes
449
+ if not is_url(p, check=False) and not isinstance(p, str):
450
+ check_suffix(p, sf) # checks
451
+ url = urlparse(p) # if url may be Triton inference server
452
+ types = [s in Path(p).name for s in sf]
453
+ types[8] &= not types[9] # tflite &= not edgetpu
454
+ triton = not any(types) and all([any(s in url.scheme for s in ['http', 'grpc']), url.netloc])
455
+ return types + [triton]
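
A minimal usage sketch for the AutoBackend wrapper above (illustrative: the weight path is an assumption, and any format from the table in the class docstring is meant to be loadable the same way):

import torch
from ultralytics.nn.autobackend import AutoBackend

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = AutoBackend(weights='yolov8n.pt', device=device, fp16=False, fuse=True, verbose=False)
model.warmup(imgsz=(1, 3, 640, 640))              # one dummy forward pass (skipped on CPU, see warmup())
im = torch.zeros(1, 3, 640, 640, device=device)   # BCHW float input in 0-1 range
y = model(im)                                     # raw predictions; model.names and model.stride hold metadata
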
modules/ultralytics/nn/autoshape.py ADDED
@@ -0,0 +1,244 @@
1
+ # Ultralytics YOLO 🚀, AGPL-3.0 license
2
+ """
3
+ Common modules
4
+ """
5
+
6
+ from copy import copy
7
+ from pathlib import Path
8
+
9
+ import cv2
10
+ import numpy as np
11
+ import requests
12
+ import torch
13
+ import torch.nn as nn
14
+ from PIL import Image, ImageOps
15
+ from torch.cuda import amp
16
+
17
+ from ultralytics.nn.autobackend import AutoBackend
18
+ from ultralytics.yolo.data.augment import LetterBox
19
+ from ultralytics.yolo.utils import LOGGER, colorstr
20
+ from ultralytics.yolo.utils.files import increment_path
21
+ from ultralytics.yolo.utils.ops import Profile, make_divisible, non_max_suppression, scale_boxes, xyxy2xywh
22
+ from ultralytics.yolo.utils.plotting import Annotator, colors, save_one_box
23
+ from ultralytics.yolo.utils.torch_utils import copy_attr, smart_inference_mode
24
+
25
+
26
+ class AutoShape(nn.Module):
27
+ """YOLOv8 input-robust model wrapper for passing cv2/np/PIL/torch inputs. Includes preprocessing, inference and NMS."""
28
+ conf = 0.25 # NMS confidence threshold
29
+ iou = 0.45 # NMS IoU threshold
30
+ agnostic = False # NMS class-agnostic
31
+ multi_label = False # NMS multiple labels per box
32
+ classes = None # (optional list) filter by class, i.e. = [0, 15, 16] for COCO persons, cats and dogs
33
+ max_det = 1000 # maximum number of detections per image
34
+ amp = False # Automatic Mixed Precision (AMP) inference
35
+
36
+ def __init__(self, model, verbose=True):
37
+ """Initializes object and copies attributes from model object."""
38
+ super().__init__()
39
+ if verbose:
40
+ LOGGER.info('Adding AutoShape... ')
41
+ copy_attr(self, model, include=('yaml', 'nc', 'hyp', 'names', 'stride', 'abc'), exclude=()) # copy attributes
42
+ self.dmb = isinstance(model, AutoBackend) # DetectMultiBackend() instance
43
+ self.pt = not self.dmb or model.pt # PyTorch model
44
+ self.model = model.eval()
45
+ if self.pt:
46
+ m = self.model.model.model[-1] if self.dmb else self.model.model[-1] # Detect()
47
+ m.inplace = False # Detect.inplace=False for safe multithread inference
48
+ m.export = True # do not output loss values
49
+
50
+ def _apply(self, fn):
51
+ """Apply to(), cpu(), cuda(), half() to model tensors that are not parameters or registered buffers."""
52
+ self = super()._apply(fn)
53
+ if self.pt:
54
+ m = self.model.model.model[-1] if self.dmb else self.model.model[-1] # Detect()
55
+ m.stride = fn(m.stride)
56
+ m.grid = list(map(fn, m.grid))
57
+ if isinstance(m.anchor_grid, list):
58
+ m.anchor_grid = list(map(fn, m.anchor_grid))
59
+ return self
60
+
61
+ @smart_inference_mode()
62
+ def forward(self, ims, size=640, augment=False, profile=False):
63
+ """Inference from various sources. For size(height=640, width=1280), RGB images example inputs are:."""
64
+ # file: ims = 'data/images/zidane.jpg' # str or PosixPath
65
+ # URI: = 'https://ultralytics.com/images/zidane.jpg'
66
+ # OpenCV: = cv2.imread('image.jpg')[:,:,::-1] # HWC BGR to RGB x(640,1280,3)
67
+ # PIL: = Image.open('image.jpg') or ImageGrab.grab() # HWC x(640,1280,3)
68
+ # numpy: = np.zeros((640,1280,3)) # HWC
69
+ # torch: = torch.zeros(16,3,320,640) # BCHW (scaled to size=640, 0-1 values)
70
+ # multiple: = [Image.open('image1.jpg'), Image.open('image2.jpg'), ...] # list of images
71
+
72
+ dt = (Profile(), Profile(), Profile())
73
+ with dt[0]:
74
+ if isinstance(size, int): # expand
75
+ size = (size, size)
76
+ p = next(self.model.parameters()) if self.pt else torch.empty(1, device=self.model.device) # param
77
+ autocast = self.amp and (p.device.type != 'cpu') # Automatic Mixed Precision (AMP) inference
78
+ if isinstance(ims, torch.Tensor): # torch
79
+ with amp.autocast(autocast):
80
+ return self.model(ims.to(p.device).type_as(p), augment=augment) # inference
81
+
82
+ # Preprocess
83
+ n, ims = (len(ims), list(ims)) if isinstance(ims, (list, tuple)) else (1, [ims]) # number, list of images
84
+ shape0, shape1, files = [], [], [] # image and inference shapes, filenames
85
+ for i, im in enumerate(ims):
86
+ f = f'image{i}' # filename
87
+ if isinstance(im, (str, Path)): # filename or uri
88
+ im, f = Image.open(requests.get(im, stream=True).raw if str(im).startswith('http') else im), im
89
+ im = np.asarray(ImageOps.exif_transpose(im))
90
+ elif isinstance(im, Image.Image): # PIL Image
91
+ im, f = np.asarray(ImageOps.exif_transpose(im)), getattr(im, 'filename', f) or f
92
+ files.append(Path(f).with_suffix('.jpg').name)
93
+ if im.shape[0] < 5: # image in CHW
94
+ im = im.transpose((1, 2, 0)) # reverse dataloader .transpose(2, 0, 1)
95
+ im = im[..., :3] if im.ndim == 3 else cv2.cvtColor(im, cv2.COLOR_GRAY2BGR) # enforce 3ch input
96
+ s = im.shape[:2] # HWC
97
+ shape0.append(s) # image shape
98
+ g = max(size) / max(s) # gain
99
+ shape1.append([y * g for y in s])
100
+ ims[i] = im if im.data.contiguous else np.ascontiguousarray(im) # update
101
+ shape1 = [make_divisible(x, self.stride) for x in np.array(shape1).max(0)] if self.pt else size # inf shape
102
+ x = [LetterBox(shape1, auto=False)(image=im)['img'] for im in ims] # pad
103
+ x = np.ascontiguousarray(np.array(x).transpose((0, 3, 1, 2))) # stack and BHWC to BCHW
104
+ x = torch.from_numpy(x).to(p.device).type_as(p) / 255 # uint8 to fp16/32
105
+
106
+ with amp.autocast(autocast):
107
+ # Inference
108
+ with dt[1]:
109
+ y = self.model(x, augment=augment) # forward
110
+
111
+ # Postprocess
112
+ with dt[2]:
113
+ y = non_max_suppression(y if self.dmb else y[0],
114
+ self.conf,
115
+ self.iou,
116
+ self.classes,
117
+ self.agnostic,
118
+ self.multi_label,
119
+ max_det=self.max_det) # NMS
120
+ for i in range(n):
121
+ scale_boxes(shape1, y[i][:, :4], shape0[i])
122
+
123
+ return Detections(ims, y, files, dt, self.names, x.shape)
124
+
125
+
126
+ class Detections:
127
+ """ YOLOv8 detections class for inference results"""
128
+
129
+ def __init__(self, ims, pred, files, times=(0, 0, 0), names=None, shape=None):
130
+ """Initialize object attributes for YOLO detection results."""
131
+ super().__init__()
132
+ d = pred[0].device # device
133
+ gn = [torch.tensor([*(im.shape[i] for i in [1, 0, 1, 0]), 1, 1], device=d) for im in ims] # normalizations
134
+ self.ims = ims # list of images as numpy arrays
135
+ self.pred = pred # list of tensors pred[0] = (xyxy, conf, cls)
136
+ self.names = names # class names
137
+ self.files = files # image filenames
138
+ self.times = times # profiling times
139
+ self.xyxy = pred # xyxy pixels
140
+ self.xywh = [xyxy2xywh(x) for x in pred] # xywh pixels
141
+ self.xyxyn = [x / g for x, g in zip(self.xyxy, gn)] # xyxy normalized
142
+ self.xywhn = [x / g for x, g in zip(self.xywh, gn)] # xywh normalized
143
+ self.n = len(self.pred) # number of images (batch size)
144
+ self.t = tuple(x.t / self.n * 1E3 for x in times) # timestamps (ms)
145
+ self.s = tuple(shape) # inference BCHW shape
146
+
147
+ def _run(self, pprint=False, show=False, save=False, crop=False, render=False, labels=True, save_dir=Path('')):
148
+ """Return performance metrics and optionally cropped/save images or results."""
149
+ s, crops = '', []
150
+ for i, (im, pred) in enumerate(zip(self.ims, self.pred)):
151
+ s += f'\nimage {i + 1}/{len(self.pred)}: {im.shape[0]}x{im.shape[1]} ' # string
152
+ if pred.shape[0]:
153
+ for c in pred[:, -1].unique():
154
+ n = (pred[:, -1] == c).sum() # detections per class
155
+ s += f"{n} {self.names[int(c)]}{'s' * (n > 1)}, " # add to string
156
+ s = s.rstrip(', ')
157
+ if show or save or render or crop:
158
+ annotator = Annotator(im, example=str(self.names))
159
+ for *box, conf, cls in reversed(pred): # xyxy, confidence, class
160
+ label = f'{self.names[int(cls)]} {conf:.2f}'
161
+ if crop:
162
+ file = save_dir / 'crops' / self.names[int(cls)] / self.files[i] if save else None
163
+ crops.append({
164
+ 'box': box,
165
+ 'conf': conf,
166
+ 'cls': cls,
167
+ 'label': label,
168
+ 'im': save_one_box(box, im, file=file, save=save)})
169
+ else: # all others
170
+ annotator.box_label(box, label if labels else '', color=colors(cls))
171
+ im = annotator.im
172
+ else:
173
+ s += '(no detections)'
174
+
175
+ im = Image.fromarray(im.astype(np.uint8)) if isinstance(im, np.ndarray) else im # from np
176
+ if show:
177
+ im.show(self.files[i]) # show
178
+ if save:
179
+ f = self.files[i]
180
+ im.save(save_dir / f) # save
181
+ if i == self.n - 1:
182
+ LOGGER.info(f"Saved {self.n} image{'s' * (self.n > 1)} to {colorstr('bold', save_dir)}")
183
+ if render:
184
+ self.ims[i] = np.asarray(im)
185
+ if pprint:
186
+ s = s.lstrip('\n')
187
+ return f'{s}\nSpeed: %.1fms preprocess, %.1fms inference, %.1fms NMS per image at shape {self.s}' % self.t
188
+ if crop:
189
+ if save:
190
+ LOGGER.info(f'Saved results to {save_dir}\n')
191
+ return crops
192
+
193
+ def show(self, labels=True):
194
+ """Displays YOLO results with detected bounding boxes."""
195
+ self._run(show=True, labels=labels) # show results
196
+
197
+ def save(self, labels=True, save_dir='runs/detect/exp', exist_ok=False):
198
+ """Save detection results with optional labels to specified directory."""
199
+ save_dir = increment_path(save_dir, exist_ok, mkdir=True) # increment save_dir
200
+ self._run(save=True, labels=labels, save_dir=save_dir) # save results
201
+
202
+ def crop(self, save=True, save_dir='runs/detect/exp', exist_ok=False):
203
+ """Crops images into detections and saves them if 'save' is True."""
204
+ save_dir = increment_path(save_dir, exist_ok, mkdir=True) if save else None
205
+ return self._run(crop=True, save=save, save_dir=save_dir) # crop results
206
+
207
+ def render(self, labels=True):
208
+ """Renders detected objects and returns images."""
209
+ self._run(render=True, labels=labels) # render results
210
+ return self.ims
211
+
212
+ def pandas(self):
213
+ """Return detections as pandas DataFrames, i.e. print(results.pandas().xyxy[0])."""
214
+ import pandas
215
+ new = copy(self) # return copy
216
+ ca = 'xmin', 'ymin', 'xmax', 'ymax', 'confidence', 'class', 'name' # xyxy columns
217
+ cb = 'xcenter', 'ycenter', 'width', 'height', 'confidence', 'class', 'name' # xywh columns
218
+ for k, c in zip(['xyxy', 'xyxyn', 'xywh', 'xywhn'], [ca, ca, cb, cb]):
219
+ a = [[x[:5] + [int(x[5]), self.names[int(x[5])]] for x in x.tolist()] for x in getattr(self, k)] # update
220
+ setattr(new, k, [pandas.DataFrame(x, columns=c) for x in a])
221
+ return new
222
+
223
+ def tolist(self):
224
+ """Return a list of Detections objects, i.e. 'for result in results.tolist():'."""
225
+ r = range(self.n) # iterable
226
+ x = [Detections([self.ims[i]], [self.pred[i]], [self.files[i]], self.times, self.names, self.s) for i in r]
227
+ # for d in x:
228
+ # for k in ['ims', 'pred', 'xyxy', 'xyxyn', 'xywh', 'xywhn']:
229
+ # setattr(d, k, getattr(d, k)[0]) # pop out of list
230
+ return x
231
+
232
+ def print(self):
233
+ """Print the results of the `self._run()` function."""
234
+ LOGGER.info(self.__str__())
235
+
236
+ def __len__(self): # override len(results)
237
+ return self.n
238
+
239
+ def __str__(self): # override print(results)
240
+ return self._run(pprint=True) # print results
241
+
242
+ def __repr__(self):
243
+ """Returns a printable representation of the object."""
244
+ return f'YOLOv8 {self.__class__} instance\n' + self.__str__()
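
The intended calling pattern for AutoShape is to wrap an already-loaded model (typically an AutoBackend instance), then pass paths, URLs, arrays or tensors straight in and get a Detections object back. A hedged sketch of that pattern (illustrative only: the weight file is an assumption, the image URL is the one from the forward() docstring, and end-to-end behaviour depends on the wrapped model's output format):

import torch
from ultralytics.nn.autobackend import AutoBackend
from ultralytics.nn.autoshape import AutoShape

backend = AutoBackend(weights='yolov8n.pt', device=torch.device('cpu'))
model = AutoShape(backend)                      # adds preprocessing, inference and NMS around the backend
results = model('https://ultralytics.com/images/zidane.jpg', size=640)
results.print()                                 # per-image summary string
boxes = results.pandas().xyxy[0]                # first image's detections as a pandas DataFrame
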
modules/ultralytics/nn/modules/__init__.py ADDED
@@ -0,0 +1,29 @@
1
+ # Ultralytics YOLO 🚀, AGPL-3.0 license
2
+ """
3
+ Ultralytics modules. Visualize with:
4
+
5
+ from ultralytics.nn.modules import *
6
+ import torch
7
+ import os
8
+
9
+ x = torch.ones(1, 128, 40, 40)
10
+ m = Conv(128, 128)
11
+ f = f'{m._get_name()}.onnx'
12
+ torch.onnx.export(m, x, f)
13
+ os.system(f'onnxsim {f} {f} && open {f}')
14
+ """
15
+
16
+ from .block import (C1, C2, C3, C3TR, DFL, SPP, SPPF, Bottleneck, BottleneckCSP, C2f, C3Ghost, C3x, GhostBottleneck,
17
+ HGBlock, HGStem, Proto, RepC3)
18
+ from .conv import (CBAM, ChannelAttention, Concat, Conv, Conv2, ConvTranspose, DWConv, DWConvTranspose2d, Focus,
19
+ GhostConv, LightConv, RepConv, SpatialAttention)
20
+ from .head import Classify, Detect, Pose, RTDETRDecoder, Segment
21
+ from .transformer import (AIFI, MLP, DeformableTransformerDecoder, DeformableTransformerDecoderLayer, LayerNorm2d,
22
+ MLPBlock, MSDeformAttn, TransformerBlock, TransformerEncoderLayer, TransformerLayer)
23
+
24
+ __all__ = ('Conv', 'Conv2', 'LightConv', 'RepConv', 'DWConv', 'DWConvTranspose2d', 'ConvTranspose', 'Focus',
25
+ 'GhostConv', 'ChannelAttention', 'SpatialAttention', 'CBAM', 'Concat', 'TransformerLayer',
26
+ 'TransformerBlock', 'MLPBlock', 'LayerNorm2d', 'DFL', 'HGBlock', 'HGStem', 'SPP', 'SPPF', 'C1', 'C2', 'C3',
27
+ 'C2f', 'C3x', 'C3TR', 'C3Ghost', 'GhostBottleneck', 'Bottleneck', 'BottleneckCSP', 'Proto', 'Detect',
28
+ 'Segment', 'Pose', 'Classify', 'TransformerEncoderLayer', 'RepC3', 'RTDETRDecoder', 'AIFI',
29
+ 'DeformableTransformerDecoder', 'DeformableTransformerDecoderLayer', 'MSDeformAttn', 'MLP')
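
Because the building blocks are exported at package level, they can be exercised in isolation. A quick illustrative shape check of the C2f block that the v8 YAMLs above use throughout (stride-1 convolutions, so spatial size is preserved and only channels change):

import torch
from ultralytics.nn.modules import C2f

m = C2f(c1=64, c2=128, n=2, shortcut=True)  # two bottlenecks, 64 -> 128 channels
y = m(torch.randn(1, 64, 40, 40))
print(y.shape)                              # torch.Size([1, 128, 40, 40])
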
modules/ultralytics/nn/modules/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (1.73 kB).
 
modules/ultralytics/nn/modules/__pycache__/block.cpython-312.pyc ADDED
Binary file (24.9 kB).
 
modules/ultralytics/nn/modules/__pycache__/conv.cpython-312.pyc ADDED
Binary file (21.5 kB).
 
modules/ultralytics/nn/modules/__pycache__/head.cpython-312.pyc ADDED
Binary file (23.8 kB).
 
modules/ultralytics/nn/modules/__pycache__/transformer.cpython-312.pyc ADDED
Binary file (26.6 kB).
 
modules/ultralytics/nn/modules/__pycache__/utils.cpython-312.pyc ADDED
Binary file (4.27 kB).
 
modules/ultralytics/nn/modules/block.py ADDED
@@ -0,0 +1,304 @@
1
+ # Ultralytics YOLO 🚀, AGPL-3.0 license
2
+ """
3
+ Block modules
4
+ """
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+
10
+ from .conv import Conv, DWConv, GhostConv, LightConv, RepConv
11
+ from .transformer import TransformerBlock
12
+
13
+ __all__ = ('DFL', 'HGBlock', 'HGStem', 'SPP', 'SPPF', 'C1', 'C2', 'C3', 'C2f', 'C3x', 'C3TR', 'C3Ghost',
14
+ 'GhostBottleneck', 'Bottleneck', 'BottleneckCSP', 'Proto', 'RepC3')
15
+
16
+
17
+ class DFL(nn.Module):
18
+ """
19
+ Integral module of Distribution Focal Loss (DFL).
20
+ Proposed in Generalized Focal Loss https://ieeexplore.ieee.org/document/9792391
21
+ """
22
+
23
+ def __init__(self, c1=16):
24
+ """Initialize a convolutional layer with a given number of input channels."""
25
+ super().__init__()
26
+ self.conv = nn.Conv2d(c1, 1, 1, bias=False).requires_grad_(False)
27
+ x = torch.arange(c1, dtype=torch.float)
28
+ self.conv.weight.data[:] = nn.Parameter(x.view(1, c1, 1, 1))
29
+ self.c1 = c1
30
+
31
+ def forward(self, x):
32
+ """Applies the Distribution Focal Loss integral to 'x' and returns the expected box distances."""
33
+ b, c, a = x.shape # batch, channels, anchors
34
+ return self.conv(x.view(b, 4, self.c1, a).transpose(2, 1).softmax(1)).view(b, 4, a)
35
+ # return self.conv(x.view(b, self.c1, 4, a).softmax(1)).view(b, 4, a)
36
+
37
+
38
+ class Proto(nn.Module):
39
+ """YOLOv8 mask Proto module for segmentation models."""
40
+
41
+ def __init__(self, c1, c_=256, c2=32): # ch_in, number of protos, number of masks
42
+ super().__init__()
43
+ self.cv1 = Conv(c1, c_, k=3)
44
+ self.upsample = nn.ConvTranspose2d(c_, c_, 2, 2, 0, bias=True) # nn.Upsample(scale_factor=2, mode='nearest')
45
+ self.cv2 = Conv(c_, c_, k=3)
46
+ self.cv3 = Conv(c_, c2)
47
+
48
+ def forward(self, x):
49
+ """Performs a forward pass through layers using an upsampled input image."""
50
+ return self.cv3(self.cv2(self.upsample(self.cv1(x))))
51
+
52
+
53
+ class HGStem(nn.Module):
54
+ """StemBlock of PPHGNetV2 with 5 convolutions and one maxpool2d.
55
+ https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py
56
+ """
57
+
58
+ def __init__(self, c1, cm, c2):
59
+ super().__init__()
60
+ self.stem1 = Conv(c1, cm, 3, 2, act=nn.ReLU())
61
+ self.stem2a = Conv(cm, cm // 2, 2, 1, 0, act=nn.ReLU())
62
+ self.stem2b = Conv(cm // 2, cm, 2, 1, 0, act=nn.ReLU())
63
+ self.stem3 = Conv(cm * 2, cm, 3, 2, act=nn.ReLU())
64
+ self.stem4 = Conv(cm, c2, 1, 1, act=nn.ReLU())
65
+ self.pool = nn.MaxPool2d(kernel_size=2, stride=1, padding=0, ceil_mode=True)
66
+
67
+ def forward(self, x):
68
+ """Forward pass of a PPHGNetV2 backbone layer."""
69
+ x = self.stem1(x)
70
+ x = F.pad(x, [0, 1, 0, 1])
71
+ x2 = self.stem2a(x)
72
+ x2 = F.pad(x2, [0, 1, 0, 1])
73
+ x2 = self.stem2b(x2)
74
+ x1 = self.pool(x)
75
+ x = torch.cat([x1, x2], dim=1)
76
+ x = self.stem3(x)
77
+ x = self.stem4(x)
78
+ return x
79
+
80
+
81
+ class HGBlock(nn.Module):
82
+ """HG_Block of PPHGNetV2 with 2 convolutions and LightConv.
83
+ https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py
84
+ """
85
+
86
+ def __init__(self, c1, cm, c2, k=3, n=6, lightconv=False, shortcut=False, act=nn.ReLU()):
87
+ super().__init__()
88
+ block = LightConv if lightconv else Conv
89
+ self.m = nn.ModuleList(block(c1 if i == 0 else cm, cm, k=k, act=act) for i in range(n))
90
+ self.sc = Conv(c1 + n * cm, c2 // 2, 1, 1, act=act) # squeeze conv
91
+ self.ec = Conv(c2 // 2, c2, 1, 1, act=act) # excitation conv
92
+ self.add = shortcut and c1 == c2
93
+
94
+ def forward(self, x):
95
+ """Forward pass of a PPHGNetV2 backbone layer."""
96
+ y = [x]
97
+ y.extend(m(y[-1]) for m in self.m)
98
+ y = self.ec(self.sc(torch.cat(y, 1)))
99
+ return y + x if self.add else y
100
+
101
+
102
+ class SPP(nn.Module):
103
+ """Spatial Pyramid Pooling (SPP) layer https://arxiv.org/abs/1406.4729."""
104
+
105
+ def __init__(self, c1, c2, k=(5, 9, 13)):
106
+ """Initialize the SPP layer with input/output channels and pooling kernel sizes."""
107
+ super().__init__()
108
+ c_ = c1 // 2 # hidden channels
109
+ self.cv1 = Conv(c1, c_, 1, 1)
110
+ self.cv2 = Conv(c_ * (len(k) + 1), c2, 1, 1)
111
+ self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k])
112
+
113
+ def forward(self, x):
114
+ """Forward pass of the SPP layer, performing spatial pyramid pooling."""
115
+ x = self.cv1(x)
116
+ return self.cv2(torch.cat([x] + [m(x) for m in self.m], 1))
117
+
118
+
119
+ class SPPF(nn.Module):
120
+ """Spatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher."""
121
+
122
+ def __init__(self, c1, c2, k=5): # equivalent to SPP(k=(5, 9, 13))
123
+ super().__init__()
124
+ c_ = c1 // 2 # hidden channels
125
+ self.cv1 = Conv(c1, c_, 1, 1)
126
+ self.cv2 = Conv(c_ * 4, c2, 1, 1)
127
+ self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)
128
+
129
+ def forward(self, x):
130
+ """Forward pass through the SPPF layer."""
131
+ x = self.cv1(x)
132
+ y1 = self.m(x)
133
+ y2 = self.m(y1)
134
+ return self.cv2(torch.cat((x, y1, y2, self.m(y2)), 1))
135
+
136
+
137
+ class C1(nn.Module):
138
+ """CSP Bottleneck with 1 convolution."""
139
+
140
+ def __init__(self, c1, c2, n=1): # ch_in, ch_out, number
141
+ super().__init__()
142
+ self.cv1 = Conv(c1, c2, 1, 1)
143
+ self.m = nn.Sequential(*(Conv(c2, c2, 3) for _ in range(n)))
144
+
145
+ def forward(self, x):
146
+ """Forward pass through the C1 module with a residual connection."""
147
+ y = self.cv1(x)
148
+ return self.m(y) + y
149
+
150
+
151
+ class C2(nn.Module):
152
+ """CSP Bottleneck with 2 convolutions."""
153
+
154
+ def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion
155
+ super().__init__()
156
+ self.c = int(c2 * e) # hidden channels
157
+ self.cv1 = Conv(c1, 2 * self.c, 1, 1)
158
+ self.cv2 = Conv(2 * self.c, c2, 1) # optional act=FReLU(c2)
159
+ # self.attention = ChannelAttention(2 * self.c) # or SpatialAttention()
160
+ self.m = nn.Sequential(*(Bottleneck(self.c, self.c, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n)))
161
+
162
+ def forward(self, x):
163
+ """Forward pass through the CSP bottleneck with 2 convolutions."""
164
+ a, b = self.cv1(x).chunk(2, 1)
165
+ return self.cv2(torch.cat((self.m(a), b), 1))
166
+
167
+
168
+ class C2f(nn.Module):
169
+ """CSP Bottleneck with 2 convolutions."""
170
+
171
+ def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion
172
+ super().__init__()
173
+ self.c = int(c2 * e) # hidden channels
174
+ self.cv1 = Conv(c1, 2 * self.c, 1, 1)
175
+ self.cv2 = Conv((2 + n) * self.c, c2, 1) # optional act=FReLU(c2)
176
+ self.m = nn.ModuleList(Bottleneck(self.c, self.c, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n))
177
+
178
+ def forward(self, x):
179
+ """Forward pass through C2f layer."""
180
+ y = list(self.cv1(x).chunk(2, 1))
181
+ y.extend(m(y[-1]) for m in self.m)
182
+ return self.cv2(torch.cat(y, 1))
183
+
184
+ def forward_split(self, x):
185
+ """Forward pass using split() instead of chunk()."""
186
+ y = list(self.cv1(x).split((self.c, self.c), 1))
187
+ y.extend(m(y[-1]) for m in self.m)
188
+ return self.cv2(torch.cat(y, 1))
189
+
190
+
191
+ class C3(nn.Module):
192
+ """CSP Bottleneck with 3 convolutions."""
193
+
194
+ def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion
195
+ super().__init__()
196
+ c_ = int(c2 * e) # hidden channels
197
+ self.cv1 = Conv(c1, c_, 1, 1)
198
+ self.cv2 = Conv(c1, c_, 1, 1)
199
+ self.cv3 = Conv(2 * c_, c2, 1) # optional act=FReLU(c2)
200
+ self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, k=((1, 1), (3, 3)), e=1.0) for _ in range(n)))
201
+
202
+ def forward(self, x):
203
+ """Forward pass through the CSP bottleneck with 2 convolutions."""
204
+ return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1))
205
+
206
+
207
+ class C3x(C3):
208
+ """C3 module with cross-convolutions."""
209
+
210
+ def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
211
+ """Initialize C3TR instance and set default parameters."""
212
+ super().__init__(c1, c2, n, shortcut, g, e)
213
+ self.c_ = int(c2 * e)
214
+ self.m = nn.Sequential(*(Bottleneck(self.c_, self.c_, shortcut, g, k=((1, 3), (3, 1)), e=1) for _ in range(n)))
215
+
216
+
217
+ class RepC3(nn.Module):
218
+ """Rep C3."""
219
+
220
+ def __init__(self, c1, c2, n=3, e=1.0):
221
+ super().__init__()
222
+ c_ = int(c2 * e) # hidden channels
223
+ self.cv1 = Conv(c1, c2, 1, 1)
224
+ self.cv2 = Conv(c1, c2, 1, 1)
225
+ self.m = nn.Sequential(*[RepConv(c_, c_) for _ in range(n)])
226
+ self.cv3 = Conv(c_, c2, 1, 1) if c_ != c2 else nn.Identity()
227
+
228
+ def forward(self, x):
229
+ """Forward pass of RT-DETR neck layer."""
230
+ return self.cv3(self.m(self.cv1(x)) + self.cv2(x))
231
+
232
+
233
+ class C3TR(C3):
234
+ """C3 module with TransformerBlock()."""
235
+
236
+ def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
237
+ """Initialize C3Ghost module with GhostBottleneck()."""
238
+ super().__init__(c1, c2, n, shortcut, g, e)
239
+ c_ = int(c2 * e)
240
+ self.m = TransformerBlock(c_, c_, 4, n)
241
+
242
+
243
+ class C3Ghost(C3):
244
+ """C3 module with GhostBottleneck()."""
245
+
246
+ def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
247
+ """Initialize 'SPP' module with various pooling sizes for spatial pyramid pooling."""
248
+ super().__init__(c1, c2, n, shortcut, g, e)
249
+ c_ = int(c2 * e) # hidden channels
250
+ self.m = nn.Sequential(*(GhostBottleneck(c_, c_) for _ in range(n)))
251
+
252
+
253
+ class GhostBottleneck(nn.Module):
254
+ """Ghost Bottleneck https://github.com/huawei-noah/ghostnet."""
255
+
256
+ def __init__(self, c1, c2, k=3, s=1): # ch_in, ch_out, kernel, stride
257
+ super().__init__()
258
+ c_ = c2 // 2
259
+ self.conv = nn.Sequential(
260
+ GhostConv(c1, c_, 1, 1), # pw
261
+ DWConv(c_, c_, k, s, act=False) if s == 2 else nn.Identity(), # dw
262
+ GhostConv(c_, c2, 1, 1, act=False)) # pw-linear
263
+ self.shortcut = nn.Sequential(DWConv(c1, c1, k, s, act=False), Conv(c1, c2, 1, 1,
264
+ act=False)) if s == 2 else nn.Identity()
265
+
266
+ def forward(self, x):
267
+ """Applies skip connection and concatenation to input tensor."""
268
+ return self.conv(x) + self.shortcut(x)
269
+
270
+
271
+ class Bottleneck(nn.Module):
272
+ """Standard bottleneck."""
273
+
274
+ def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5): # ch_in, ch_out, shortcut, groups, kernels, expand
275
+ super().__init__()
276
+ c_ = int(c2 * e) # hidden channels
277
+ self.cv1 = Conv(c1, c_, k[0], 1)
278
+ self.cv2 = Conv(c_, c2, k[1], 1, g=g)
279
+ self.add = shortcut and c1 == c2
280
+
281
+ def forward(self, x):
282
+ """'forward()' applies the YOLOv5 FPN to input data."""
283
+ return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))
284
+
285
+
286
+ class BottleneckCSP(nn.Module):
287
+ """CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks."""
288
+
289
+ def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion
290
+ super().__init__()
291
+ c_ = int(c2 * e) # hidden channels
292
+ self.cv1 = Conv(c1, c_, 1, 1)
293
+ self.cv2 = nn.Conv2d(c1, c_, 1, 1, bias=False)
294
+ self.cv3 = nn.Conv2d(c_, c_, 1, 1, bias=False)
295
+ self.cv4 = Conv(2 * c_, c2, 1, 1)
296
+ self.bn = nn.BatchNorm2d(2 * c_) # applied to cat(cv2, cv3)
297
+ self.act = nn.SiLU()
298
+ self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)))
299
+
300
+ def forward(self, x):
301
+ """Applies a CSP bottleneck with 3 convolutions."""
302
+ y1 = self.cv3(self.m(self.cv1(x)))
303
+ y2 = self.cv2(x)
304
+ return self.cv4(self.act(self.bn(torch.cat((y1, y2), 1))))
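
Before the new `conv.py`, a quick shape check for the C2f block defined above (same illustrative import path as before): the n bottlenecks only widen the intermediate concatenation that cv2 consumes, while the output keeps the requested channel count and the input's spatial size.

import torch
from ultralytics.nn.modules.block import C2f  # illustrative import path

m = C2f(c1=64, c2=128, n=3, shortcut=True).eval()
x = torch.randn(2, 64, 40, 40)
with torch.no_grad():
    y = m(x)
print(y.shape)  # torch.Size([2, 128, 40, 40]); cv2 consumed (2 + n) * 64 = 320 concatenated channels
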
modules/ultralytics/nn/modules/conv.py ADDED
@@ -0,0 +1,297 @@
1
+ # Ultralytics YOLO 🚀, AGPL-3.0 license
2
+ """
3
+ Convolution modules
4
+ """
5
+
6
+ import math
7
+
8
+ import numpy as np
9
+ import torch
10
+ import torch.nn as nn
11
+
12
+ __all__ = ('Conv', 'LightConv', 'DWConv', 'DWConvTranspose2d', 'ConvTranspose', 'Focus', 'GhostConv',
13
+ 'ChannelAttention', 'SpatialAttention', 'CBAM', 'Concat', 'RepConv')
14
+
15
+
16
+ def autopad(k, p=None, d=1): # kernel, padding, dilation
17
+ """Pad to 'same' shape outputs."""
18
+ if d > 1:
19
+ k = d * (k - 1) + 1 if isinstance(k, int) else [d * (x - 1) + 1 for x in k] # actual kernel-size
20
+ if p is None:
21
+ p = k // 2 if isinstance(k, int) else [x // 2 for x in k] # auto-pad
22
+ return p
23
+
24
+
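
`autopad` above computes 'same'-style padding, including the dilated case where the effective kernel grows to d * (k - 1) + 1. A few spot checks (only the import path is an assumption):

from ultralytics.nn.modules.conv import autopad  # illustrative import path

assert autopad(3) == 1            # 3x3 kernel -> pad 1
assert autopad(5) == 2            # 5x5 kernel -> pad 2
assert autopad(3, d=2) == 2       # dilation 2 makes k=3 behave like a 5x5 kernel -> pad 2
assert autopad((1, 3)) == [0, 1]  # per-dimension kernels give per-dimension padding
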
25
+ class Conv(nn.Module):
26
+ """Standard convolution with args(ch_in, ch_out, kernel, stride, padding, groups, dilation, activation)."""
27
+ default_act = nn.SiLU() # default activation
28
+
29
+ def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True):
30
+ """Initialize Conv layer with given arguments including activation."""
31
+ super().__init__()
32
+ self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groups=g, dilation=d, bias=False)
33
+ self.bn = nn.BatchNorm2d(c2)
34
+ self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity()
35
+
36
+ def forward(self, x):
37
+ """Apply convolution, batch normalization and activation to input tensor."""
38
+ return self.act(self.bn(self.conv(x)))
39
+
40
+ def forward_fuse(self, x):
41
+ """Perform transposed convolution of 2D data."""
42
+ return self.act(self.conv(x))
43
+
44
+
45
+ class Conv2(Conv):
46
+ """Simplified RepConv module with Conv fusing."""
47
+
48
+ def __init__(self, c1, c2, k=3, s=1, p=None, g=1, d=1, act=True):
49
+ """Initialize Conv layer with given arguments including activation."""
50
+ super().__init__(c1, c2, k, s, p, g=g, d=d, act=act)
51
+ self.cv2 = nn.Conv2d(c1, c2, 1, s, autopad(1, p, d), groups=g, dilation=d, bias=False) # add 1x1 conv
52
+
53
+ def forward(self, x):
54
+ """Apply convolution, batch normalization and activation to input tensor."""
55
+ return self.act(self.bn(self.conv(x) + self.cv2(x)))
56
+
57
+ def fuse_convs(self):
58
+ """Fuse parallel convolutions."""
59
+ w = torch.zeros_like(self.conv.weight.data)
60
+ i = [x // 2 for x in w.shape[2:]]
61
+ w[:, :, i[0]:i[0] + 1, i[1]:i[1] + 1] = self.cv2.weight.data.clone()
62
+ self.conv.weight.data += w
63
+ self.__delattr__('cv2')
64
+
65
+
66
+ class LightConv(nn.Module):
67
+ """Light convolution with args(ch_in, ch_out, kernel).
68
+ https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py
69
+ """
70
+
71
+ def __init__(self, c1, c2, k=1, act=nn.ReLU()):
72
+ """Initialize Conv layer with given arguments including activation."""
73
+ super().__init__()
74
+ self.conv1 = Conv(c1, c2, 1, act=False)
75
+ self.conv2 = DWConv(c2, c2, k, act=act)
76
+
77
+ def forward(self, x):
78
+ """Apply 2 convolutions to input tensor."""
79
+ return self.conv2(self.conv1(x))
80
+
81
+
82
+ class DWConv(Conv):
83
+ """Depth-wise convolution."""
84
+
85
+ def __init__(self, c1, c2, k=1, s=1, d=1, act=True): # ch_in, ch_out, kernel, stride, dilation, activation
86
+ super().__init__(c1, c2, k, s, g=math.gcd(c1, c2), d=d, act=act)
87
+
88
+
89
+ class DWConvTranspose2d(nn.ConvTranspose2d):
90
+ """Depth-wise transpose convolution."""
91
+
92
+ def __init__(self, c1, c2, k=1, s=1, p1=0, p2=0): # ch_in, ch_out, kernel, stride, padding, padding_out
93
+ super().__init__(c1, c2, k, s, p1, p2, groups=math.gcd(c1, c2))
94
+
95
+
96
+ class ConvTranspose(nn.Module):
97
+ """Convolution transpose 2d layer."""
98
+ default_act = nn.SiLU() # default activation
99
+
100
+ def __init__(self, c1, c2, k=2, s=2, p=0, bn=True, act=True):
101
+ """Initialize ConvTranspose2d layer with batch normalization and activation function."""
102
+ super().__init__()
103
+ self.conv_transpose = nn.ConvTranspose2d(c1, c2, k, s, p, bias=not bn)
104
+ self.bn = nn.BatchNorm2d(c2) if bn else nn.Identity()
105
+ self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity()
106
+
107
+ def forward(self, x):
108
+ """Applies transposed convolutions, batch normalization and activation to input."""
109
+ return self.act(self.bn(self.conv_transpose(x)))
110
+
111
+ def forward_fuse(self, x):
112
+ """Applies activation and convolution transpose operation to input."""
113
+ return self.act(self.conv_transpose(x))
114
+
115
+
116
+ class Focus(nn.Module):
117
+ """Focus wh information into c-space."""
118
+
119
+ def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True): # ch_in, ch_out, kernel, stride, padding, groups
120
+ super().__init__()
121
+ self.conv = Conv(c1 * 4, c2, k, s, p, g, act=act)
122
+ # self.contract = Contract(gain=2)
123
+
124
+ def forward(self, x): # x(b,c,w,h) -> y(b,4c,w/2,h/2)
125
+ return self.conv(torch.cat((x[..., ::2, ::2], x[..., 1::2, ::2], x[..., ::2, 1::2], x[..., 1::2, 1::2]), 1))
126
+ # return self.conv(self.contract(x))
127
+
128
+
129
+ class GhostConv(nn.Module):
130
+ """Ghost Convolution https://github.com/huawei-noah/ghostnet."""
131
+
132
+ def __init__(self, c1, c2, k=1, s=1, g=1, act=True): # ch_in, ch_out, kernel, stride, groups
133
+ super().__init__()
134
+ c_ = c2 // 2 # hidden channels
135
+ self.cv1 = Conv(c1, c_, k, s, None, g, act=act)
136
+ self.cv2 = Conv(c_, c_, 5, 1, None, c_, act=act)
137
+
138
+ def forward(self, x):
139
+ """Forward propagation through a Ghost Bottleneck layer with skip connection."""
140
+ y = self.cv1(x)
141
+ return torch.cat((y, self.cv2(y)), 1)
142
+
143
+
144
+ class RepConv(nn.Module):
145
+ """RepConv is a basic rep-style block, including training and deploy status
146
+ This code is based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py
147
+ """
148
+ default_act = nn.SiLU() # default activation
149
+
150
+ def __init__(self, c1, c2, k=3, s=1, p=1, g=1, d=1, act=True, bn=False, deploy=False):
151
+ super().__init__()
152
+ assert k == 3 and p == 1
153
+ self.g = g
154
+ self.c1 = c1
155
+ self.c2 = c2
156
+ self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity()
157
+
158
+ self.bn = nn.BatchNorm2d(num_features=c1) if bn and c2 == c1 and s == 1 else None
159
+ self.conv1 = Conv(c1, c2, k, s, p=p, g=g, act=False)
160
+ self.conv2 = Conv(c1, c2, 1, s, p=(p - k // 2), g=g, act=False)
161
+
162
+ def forward_fuse(self, x):
163
+ """Forward process"""
164
+ return self.act(self.conv(x))
165
+
166
+ def forward(self, x):
167
+ """Forward process"""
168
+ id_out = 0 if self.bn is None else self.bn(x)
169
+ return self.act(self.conv1(x) + self.conv2(x) + id_out)
170
+
171
+ def get_equivalent_kernel_bias(self):
172
+ kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1)
173
+ kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2)
174
+ kernelid, biasid = self._fuse_bn_tensor(self.bn)
175
+ return kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid
176
+
177
+ def _avg_to_3x3_tensor(self, avgp):
178
+ channels = self.c1
179
+ groups = self.g
180
+ kernel_size = avgp.kernel_size
181
+ input_dim = channels // groups
182
+ k = torch.zeros((channels, input_dim, kernel_size, kernel_size))
183
+ k[np.arange(channels), np.tile(np.arange(input_dim), groups), :, :] = 1.0 / kernel_size ** 2
184
+ return k
185
+
186
+ def _pad_1x1_to_3x3_tensor(self, kernel1x1):
187
+ if kernel1x1 is None:
188
+ return 0
189
+ else:
190
+ return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1])
191
+
192
+ def _fuse_bn_tensor(self, branch):
193
+ if branch is None:
194
+ return 0, 0
195
+ if isinstance(branch, Conv):
196
+ kernel = branch.conv.weight
197
+ running_mean = branch.bn.running_mean
198
+ running_var = branch.bn.running_var
199
+ gamma = branch.bn.weight
200
+ beta = branch.bn.bias
201
+ eps = branch.bn.eps
202
+ elif isinstance(branch, nn.BatchNorm2d):
203
+ if not hasattr(self, 'id_tensor'):
204
+ input_dim = self.c1 // self.g
205
+ kernel_value = np.zeros((self.c1, input_dim, 3, 3), dtype=np.float32)
206
+ for i in range(self.c1):
207
+ kernel_value[i, i % input_dim, 1, 1] = 1
208
+ self.id_tensor = torch.from_numpy(kernel_value).to(branch.weight.device)
209
+ kernel = self.id_tensor
210
+ running_mean = branch.running_mean
211
+ running_var = branch.running_var
212
+ gamma = branch.weight
213
+ beta = branch.bias
214
+ eps = branch.eps
215
+ std = (running_var + eps).sqrt()
216
+ t = (gamma / std).reshape(-1, 1, 1, 1)
217
+ return kernel * t, beta - running_mean * gamma / std
218
+
219
+ def fuse_convs(self):
220
+ if hasattr(self, 'conv'):
221
+ return
222
+ kernel, bias = self.get_equivalent_kernel_bias()
223
+ self.conv = nn.Conv2d(in_channels=self.conv1.conv.in_channels,
224
+ out_channels=self.conv1.conv.out_channels,
225
+ kernel_size=self.conv1.conv.kernel_size,
226
+ stride=self.conv1.conv.stride,
227
+ padding=self.conv1.conv.padding,
228
+ dilation=self.conv1.conv.dilation,
229
+ groups=self.conv1.conv.groups,
230
+ bias=True).requires_grad_(False)
231
+ self.conv.weight.data = kernel
232
+ self.conv.bias.data = bias
233
+ for para in self.parameters():
234
+ para.detach_()
235
+ self.__delattr__('conv1')
236
+ self.__delattr__('conv2')
237
+ if hasattr(self, 'nm'):
238
+ self.__delattr__('nm')
239
+ if hasattr(self, 'bn'):
240
+ self.__delattr__('bn')
241
+ if hasattr(self, 'id_tensor'):
242
+ self.__delattr__('id_tensor')
243
+
244
+
245
+ class ChannelAttention(nn.Module):
246
+ """Channel-attention module https://github.com/open-mmlab/mmdetection/tree/v3.0.0rc1/configs/rtmdet."""
247
+
248
+ def __init__(self, channels: int) -> None:
249
+ super().__init__()
250
+ self.pool = nn.AdaptiveAvgPool2d(1)
251
+ self.fc = nn.Conv2d(channels, channels, 1, 1, 0, bias=True)
252
+ self.act = nn.Sigmoid()
253
+
254
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
255
+ return x * self.act(self.fc(self.pool(x)))
256
+
257
+
258
+ class SpatialAttention(nn.Module):
259
+ """Spatial-attention module."""
260
+
261
+ def __init__(self, kernel_size=7):
262
+ """Initialize Spatial-attention module with kernel size argument."""
263
+ super().__init__()
264
+ assert kernel_size in (3, 7), 'kernel size must be 3 or 7'
265
+ padding = 3 if kernel_size == 7 else 1
266
+ self.cv1 = nn.Conv2d(2, 1, kernel_size, padding=padding, bias=False)
267
+ self.act = nn.Sigmoid()
268
+
269
+ def forward(self, x):
270
+ """Apply channel and spatial attention on input for feature recalibration."""
271
+ return x * self.act(self.cv1(torch.cat([torch.mean(x, 1, keepdim=True), torch.max(x, 1, keepdim=True)[0]], 1)))
272
+
273
+
274
+ class CBAM(nn.Module):
275
+ """Convolutional Block Attention Module."""
276
+
277
+ def __init__(self, c1, kernel_size=7): # ch_in, kernels
278
+ super().__init__()
279
+ self.channel_attention = ChannelAttention(c1)
280
+ self.spatial_attention = SpatialAttention(kernel_size)
281
+
282
+ def forward(self, x):
283
+ """Applies the forward pass through C1 module."""
284
+ return self.spatial_attention(self.channel_attention(x))
285
+
286
+
287
+ class Concat(nn.Module):
288
+ """Concatenate a list of tensors along dimension."""
289
+
290
+ def __init__(self, dimension=1):
291
+ """Concatenates a list of tensors along a specified dimension."""
292
+ super().__init__()
293
+ self.d = dimension
294
+
295
+ def forward(self, x):
296
+ """Forward pass for the YOLOv8 mask Proto module."""
297
+ return torch.cat(x, self.d)
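
To close out `conv.py`, a sketch of the RepConv re-parameterisation above, checking that after `fuse_convs()` the single folded 3x3 convolution reproduces the three-branch output (illustrative import path; CPU, eval mode, default-initialised weights):

import torch
from ultralytics.nn.modules.conv import RepConv  # illustrative import path

m = RepConv(64, 64, k=3, s=1, bn=True).eval()  # identity BN branch active because c1 == c2 and s == 1
x = torch.randn(1, 64, 32, 32)
with torch.no_grad():
    y_branches = m(x)      # conv1 (3x3) + conv2 (1x1) + BN identity branch, then SiLU
    m.fuse_convs()         # folds all branches into a single m.conv
    y_fused = m.forward_fuse(x)
assert torch.allclose(y_branches, y_fused, atol=1e-5)
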
modules/ultralytics/nn/modules/head.py ADDED
@@ -0,0 +1,351 @@
1
+ # Ultralytics YOLO 🚀, AGPL-3.0 license
2
+ """
3
+ Model head modules
4
+ """
5
+
6
+ import math
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ from torch.nn.init import constant_, xavier_uniform_
11
+
12
+ from ultralytics.yolo.utils.tal import dist2bbox, make_anchors
13
+
14
+ from .block import DFL, Proto
15
+ from .conv import Conv
16
+ from .transformer import MLP, DeformableTransformerDecoder, DeformableTransformerDecoderLayer
17
+ from .utils import bias_init_with_prob, linear_init_
18
+
19
+ __all__ = 'Detect', 'Segment', 'Pose', 'Classify', 'RTDETRDecoder'
20
+
21
+
22
+ class Detect(nn.Module):
23
+ """YOLOv8 Detect head for detection models."""
24
+ dynamic = False # force grid reconstruction
25
+ export = False # export mode
26
+ shape = None
27
+ anchors = torch.empty(0) # init
28
+ strides = torch.empty(0) # init
29
+
30
+ def __init__(self, nc=80, ch=()): # detection layer
31
+ super().__init__()
32
+ self.nc = nc # number of classes
33
+ self.nl = len(ch) # number of detection layers
34
+ self.reg_max = 26 # DFL channels (ch[0] // 16 to scale 4/8/12/16/20 for n/s/m/l/x)
35
+ self.no = nc + self.reg_max * 4 # number of outputs per anchor
36
+ self.stride = torch.zeros(self.nl) # strides computed during build
37
+ c2, c3 = max((16, ch[0] // 4, self.reg_max * 4)), max(ch[0], self.nc) # channels
38
+ self.cv2 = nn.ModuleList(
39
+ nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch)
40
+ self.cv3 = nn.ModuleList(nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, self.nc, 1)) for x in ch)
41
+ self.dfl = DFL(self.reg_max) if self.reg_max > 1 else nn.Identity()
42
+
43
+ def forward(self, x):
44
+ """Concatenates and returns predicted bounding boxes and class probabilities."""
45
+ shape = x[0].shape # BCHW
46
+ for i in range(self.nl):
47
+ x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)
48
+ if self.training:
49
+ return x
50
+ elif self.dynamic or self.shape != shape:
51
+ self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5))
52
+ self.shape = shape
53
+
54
+ x_cat = torch.cat([xi.view(shape[0], self.no, -1) for xi in x], 2)
55
+ if self.export and self.format in ('saved_model', 'pb', 'tflite', 'edgetpu', 'tfjs'): # avoid TF FlexSplitV ops
56
+ box = x_cat[:, :self.reg_max * 4]
57
+ cls = x_cat[:, self.reg_max * 4:]
58
+ else:
59
+ box, cls = x_cat.split((self.reg_max * 4, self.nc), 1)
60
+ dbox = dist2bbox(self.dfl(box), self.anchors.unsqueeze(0), xywh=True, dim=1) * self.strides
61
+ y = torch.cat((dbox, cls.sigmoid()), 1)
62
+ return y if self.export else (y, x)
63
+
64
+ def bias_init(self):
65
+ """Initialize Detect() biases, WARNING: requires stride availability."""
66
+ m = self # self.model[-1] # Detect() module
67
+ # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1
68
+ # ncf = math.log(0.6 / (m.nc - 0.999999)) if cf is None else torch.log(cf / cf.sum()) # nominal class frequency
69
+ for a, b, s in zip(m.cv2, m.cv3, m.stride): # from
70
+ a[-1].bias.data[:] = 1.0 # box
71
+ b[-1].bias.data[:m.nc] = math.log(5 / m.nc / (640 / s) ** 2) # cls (.01 objects, 80 classes, 640 img)
72
+
73
+
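
A sketch of what the Detect head above emits in training mode, where each scale keeps its raw per-anchor channels (nc class logits plus 4 * reg_max DFL bins); the feature shapes and the import path are illustrative:

import torch
from ultralytics.nn.modules.head import Detect  # illustrative import path

head = Detect(nc=80, ch=(64, 128, 256)).train()
feats = [torch.randn(1, c, s, s) for c, s in zip((64, 128, 256), (80, 40, 20))]
out = head(feats)
print([o.shape for o in out])  # each scale: (1, head.no, H, W) with head.no == head.nc + 4 * head.reg_max
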
74
+ class Segment(Detect):
75
+ """YOLOv8 Segment head for segmentation models."""
76
+
77
+ def __init__(self, nc=80, nm=32, npr=256, ch=()):
78
+ """Initialize the YOLO model attributes such as the number of masks, prototypes, and the convolution layers."""
79
+ super().__init__(nc, ch)
80
+ self.nm = nm # number of masks
81
+ self.npr = npr # number of protos
82
+ #self.proto = Proto(ch[0], self.npr, self.nm) # protos
83
+ self.detect = Detect.forward
84
+
85
+ c4 = max(ch[0] // 4, self.nm)
86
+ self.cv4 = nn.ModuleList(nn.Sequential(Conv(x, c4, 3), Conv(c4, c4, 3), nn.Conv2d(c4, self.nm, 1)) for x in ch)
87
+
88
+ def forward(self, x):
89
+ """Return model outputs and mask coefficients if training, otherwise return outputs and mask coefficients."""
90
+ #p = self.proto(x[0]) # mask protos #mobilesamv2 change
91
+ p = 0
92
+ # import pdb;pdb.set_trace()
93
+ bs = x[0].shape[0] # batch size
94
+
95
+ mc = torch.cat([self.cv4[i](x[i]).view(bs, self.nm, -1) for i in range(self.nl)], 2) # mask coefficients
96
+ x = self.detect(self, x)
97
+ if self.training:
98
+ return x, mc, p
99
+ return (torch.cat([x, mc], 1), p) if self.export else (torch.cat([x[0], mc], 1), (x[1], mc, p))
100
+
101
+
102
+ class Pose(Detect):
103
+ """YOLOv8 Pose head for keypoints models."""
104
+
105
+ def __init__(self, nc=80, kpt_shape=(17, 3), ch=()):
106
+ """Initialize YOLO network with default parameters and Convolutional Layers."""
107
+ super().__init__(nc, ch)
108
+ self.kpt_shape = kpt_shape # number of keypoints, number of dims (2 for x,y or 3 for x,y,visible)
109
+ self.nk = kpt_shape[0] * kpt_shape[1] # number of keypoints total
110
+ self.detect = Detect.forward
111
+
112
+ c4 = max(ch[0] // 4, self.nk)
113
+ self.cv4 = nn.ModuleList(nn.Sequential(Conv(x, c4, 3), Conv(c4, c4, 3), nn.Conv2d(c4, self.nk, 1)) for x in ch)
114
+
115
+ def forward(self, x):
116
+ """Perform forward pass through YOLO model and return predictions."""
117
+ bs = x[0].shape[0] # batch size
118
+ kpt = torch.cat([self.cv4[i](x[i]).view(bs, self.nk, -1) for i in range(self.nl)], -1) # (bs, 17*3, h*w)
119
+ x = self.detect(self, x)
120
+ if self.training:
121
+ return x, kpt
122
+ pred_kpt = self.kpts_decode(bs, kpt)
123
+ return torch.cat([x, pred_kpt], 1) if self.export else (torch.cat([x[0], pred_kpt], 1), (x[1], kpt))
124
+
125
+ def kpts_decode(self, bs, kpts):
126
+ """Decodes keypoints."""
127
+ ndim = self.kpt_shape[1]
128
+ if self.export: # required for TFLite export to avoid 'PLACEHOLDER_FOR_GREATER_OP_CODES' bug
129
+ y = kpts.view(bs, *self.kpt_shape, -1)
130
+ a = (y[:, :, :2] * 2.0 + (self.anchors - 0.5)) * self.strides
131
+ if ndim == 3:
132
+ a = torch.cat((a, y[:, :, 2:3].sigmoid()), 2)
133
+ return a.view(bs, self.nk, -1)
134
+ else:
135
+ y = kpts.clone()
136
+ if ndim == 3:
137
+ y[:, 2::3].sigmoid_() # inplace sigmoid
138
+ y[:, 0::ndim] = (y[:, 0::ndim] * 2.0 + (self.anchors[0] - 0.5)) * self.strides
139
+ y[:, 1::ndim] = (y[:, 1::ndim] * 2.0 + (self.anchors[1] - 0.5)) * self.strides
140
+ return y
141
+
142
+
143
+ class Classify(nn.Module):
144
+ """YOLOv8 classification head, i.e. x(b,c1,20,20) to x(b,c2)."""
145
+
146
+ def __init__(self, c1, c2, k=1, s=1, p=None, g=1): # ch_in, ch_out, kernel, stride, padding, groups
147
+ super().__init__()
148
+ c_ = 1280 # efficientnet_b0 size
149
+ self.conv = Conv(c1, c_, k, s, p, g)
150
+ self.pool = nn.AdaptiveAvgPool2d(1) # to x(b,c_,1,1)
151
+ self.drop = nn.Dropout(p=0.0, inplace=True)
152
+ self.linear = nn.Linear(c_, c2) # to x(b,c2)
153
+
154
+ def forward(self, x):
155
+ """Performs a forward pass of the YOLO model on input image data."""
156
+ if isinstance(x, list):
157
+ x = torch.cat(x, 1)
158
+ x = self.linear(self.drop(self.pool(self.conv(x)).flatten(1)))
159
+ return x if self.training else x.softmax(1)
160
+
161
+
162
+ class RTDETRDecoder(nn.Module):
163
+
164
+ def __init__(
165
+ self,
166
+ nc=80,
167
+ ch=(512, 1024, 2048),
168
+ hd=256, # hidden dim
169
+ nq=300, # num queries
170
+ ndp=4, # num decoder points
171
+ nh=8, # num head
172
+ ndl=6, # num decoder layers
173
+ d_ffn=1024, # dim of feedforward
174
+ dropout=0.,
175
+ act=nn.ReLU(),
176
+ eval_idx=-1,
177
+ # training args
178
+ nd=100, # num denoising
179
+ label_noise_ratio=0.5,
180
+ box_noise_scale=1.0,
181
+ learnt_init_query=False):
182
+ super().__init__()
183
+ self.hidden_dim = hd
184
+ self.nhead = nh
185
+ self.nl = len(ch) # num level
186
+ self.nc = nc
187
+ self.num_queries = nq
188
+ self.num_decoder_layers = ndl
189
+
190
+ # backbone feature projection
191
+ self.input_proj = nn.ModuleList(nn.Sequential(nn.Conv2d(x, hd, 1, bias=False), nn.BatchNorm2d(hd)) for x in ch)
192
+ # NOTE: simplified version but it's not consistent with .pt weights.
193
+ # self.input_proj = nn.ModuleList(Conv(x, hd, act=False) for x in ch)
194
+
195
+ # Transformer module
196
+ decoder_layer = DeformableTransformerDecoderLayer(hd, nh, d_ffn, dropout, act, self.nl, ndp)
197
+ self.decoder = DeformableTransformerDecoder(hd, decoder_layer, ndl, eval_idx)
198
+
199
+ # denoising part
200
+ self.denoising_class_embed = nn.Embedding(nc, hd)
201
+ self.num_denoising = nd
202
+ self.label_noise_ratio = label_noise_ratio
203
+ self.box_noise_scale = box_noise_scale
204
+
205
+ # decoder embedding
206
+ self.learnt_init_query = learnt_init_query
207
+ if learnt_init_query:
208
+ self.tgt_embed = nn.Embedding(nq, hd)
209
+ self.query_pos_head = MLP(4, 2 * hd, hd, num_layers=2)
210
+
211
+ # encoder head
212
+ self.enc_output = nn.Sequential(nn.Linear(hd, hd), nn.LayerNorm(hd))
213
+ self.enc_score_head = nn.Linear(hd, nc)
214
+ self.enc_bbox_head = MLP(hd, hd, 4, num_layers=3)
215
+
216
+ # decoder head
217
+ self.dec_score_head = nn.ModuleList([nn.Linear(hd, nc) for _ in range(ndl)])
218
+ self.dec_bbox_head = nn.ModuleList([MLP(hd, hd, 4, num_layers=3) for _ in range(ndl)])
219
+
220
+ self._reset_parameters()
221
+
222
+ def forward(self, x, batch=None):
223
+ from ultralytics.vit.utils.ops import get_cdn_group
224
+
225
+ # input projection and embedding
226
+ feats, shapes = self._get_encoder_input(x)
227
+
228
+ # prepare denoising training
229
+ dn_embed, dn_bbox, attn_mask, dn_meta = \
230
+ get_cdn_group(batch,
231
+ self.nc,
232
+ self.num_queries,
233
+ self.denoising_class_embed.weight,
234
+ self.num_denoising,
235
+ self.label_noise_ratio,
236
+ self.box_noise_scale,
237
+ self.training)
238
+
239
+ embed, refer_bbox, enc_bboxes, enc_scores = \
240
+ self._get_decoder_input(feats, shapes, dn_embed, dn_bbox)
241
+
242
+ # decoder
243
+ dec_bboxes, dec_scores = self.decoder(embed,
244
+ refer_bbox,
245
+ feats,
246
+ shapes,
247
+ self.dec_bbox_head,
248
+ self.dec_score_head,
249
+ self.query_pos_head,
250
+ attn_mask=attn_mask)
251
+ if not self.training:
252
+ dec_scores = dec_scores.sigmoid_()
253
+ return dec_bboxes, dec_scores, enc_bboxes, enc_scores, dn_meta
254
+
255
+ def _generate_anchors(self, shapes, grid_size=0.05, dtype=torch.float32, device='cpu', eps=1e-2):
256
+ anchors = []
257
+ for i, (h, w) in enumerate(shapes):
258
+ grid_y, grid_x = torch.meshgrid(torch.arange(end=h, dtype=dtype, device=device),
259
+ torch.arange(end=w, dtype=dtype, device=device),
260
+ indexing='ij')
261
+ grid_xy = torch.stack([grid_x, grid_y], -1) # (h, w, 2)
262
+
263
+ valid_WH = torch.tensor([h, w], dtype=dtype, device=device)
264
+ grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH # (1, h, w, 2)
265
+ wh = torch.ones_like(grid_xy, dtype=dtype, device=device) * grid_size * (2.0 ** i)
266
+ anchors.append(torch.cat([grid_xy, wh], -1).view(-1, h * w, 4)) # (1, h*w, 4)
267
+
268
+ anchors = torch.cat(anchors, 1) # (1, h*w*nl, 4)
269
+ valid_mask = ((anchors > eps) * (anchors < 1 - eps)).all(-1, keepdim=True) # 1, h*w*nl, 1
270
+ anchors = torch.log(anchors / (1 - anchors))
271
+ anchors = torch.where(valid_mask, anchors, torch.inf)
272
+ return anchors, valid_mask
273
+
274
+ def _get_encoder_input(self, x):
275
+ # get projection features
276
+ x = [self.input_proj[i](feat) for i, feat in enumerate(x)]
277
+ # get encoder inputs
278
+ feats = []
279
+ shapes = []
280
+ for feat in x:
281
+ h, w = feat.shape[2:]
282
+ # [b, c, h, w] -> [b, h*w, c]
283
+ feats.append(feat.flatten(2).permute(0, 2, 1))
284
+ # [nl, 2]
285
+ shapes.append([h, w])
286
+
287
+ # [b, h*w, c]
288
+ feats = torch.cat(feats, 1)
289
+ return feats, shapes
290
+
291
+ def _get_decoder_input(self, feats, shapes, dn_embed=None, dn_bbox=None):
292
+ bs = len(feats)
293
+ # prepare input for decoder
294
+ anchors, valid_mask = self._generate_anchors(shapes, dtype=feats.dtype, device=feats.device)
295
+ features = self.enc_output(torch.where(valid_mask, feats, 0)) # bs, h*w, 256
296
+
297
+ enc_outputs_scores = self.enc_score_head(features) # (bs, h*w, nc)
298
+ # dynamic anchors + static content
299
+ enc_outputs_bboxes = self.enc_bbox_head(features) + anchors # (bs, h*w, 4)
300
+
301
+ # query selection
302
+ # (bs, num_queries)
303
+ topk_ind = torch.topk(enc_outputs_scores.max(-1).values, self.num_queries, dim=1).indices.view(-1)
304
+ # (bs, num_queries)
305
+ batch_ind = torch.arange(end=bs, dtype=topk_ind.dtype).unsqueeze(-1).repeat(1, self.num_queries).view(-1)
306
+
307
+ # Unsigmoided
308
+ refer_bbox = enc_outputs_bboxes[batch_ind, topk_ind].view(bs, self.num_queries, -1)
309
+ # refer_bbox = torch.gather(enc_outputs_bboxes, 1, topk_ind.reshape(bs, self.num_queries).unsqueeze(-1).repeat(1, 1, 4))
310
+
311
+ enc_bboxes = refer_bbox.sigmoid()
312
+ if dn_bbox is not None:
313
+ refer_bbox = torch.cat([dn_bbox, refer_bbox], 1)
314
+ if self.training:
315
+ refer_bbox = refer_bbox.detach()
316
+ enc_scores = enc_outputs_scores[batch_ind, topk_ind].view(bs, self.num_queries, -1)
317
+
318
+ if self.learnt_init_query:
319
+ embeddings = self.tgt_embed.weight.unsqueeze(0).repeat(bs, 1, 1)
320
+ else:
321
+ embeddings = features[batch_ind, topk_ind].view(bs, self.num_queries, -1)
322
+ if self.training:
323
+ embeddings = embeddings.detach()
324
+ if dn_embed is not None:
325
+ embeddings = torch.cat([dn_embed, embeddings], 1)
326
+
327
+ return embeddings, refer_bbox, enc_bboxes, enc_scores
328
+
329
+ # TODO
330
+ def _reset_parameters(self):
331
+ # class and bbox head init
332
+ bias_cls = bias_init_with_prob(0.01) / 80 * self.nc
333
+ # NOTE: the weight initialization in `linear_init_` would cause NaN when training with custom datasets.
334
+ # linear_init_(self.enc_score_head)
335
+ constant_(self.enc_score_head.bias, bias_cls)
336
+ constant_(self.enc_bbox_head.layers[-1].weight, 0.)
337
+ constant_(self.enc_bbox_head.layers[-1].bias, 0.)
338
+ for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head):
339
+ # linear_init_(cls_)
340
+ constant_(cls_.bias, bias_cls)
341
+ constant_(reg_.layers[-1].weight, 0.)
342
+ constant_(reg_.layers[-1].bias, 0.)
343
+
344
+ linear_init_(self.enc_output[0])
345
+ xavier_uniform_(self.enc_output[0].weight)
346
+ if self.learnt_init_query:
347
+ xavier_uniform_(self.tgt_embed.weight)
348
+ xavier_uniform_(self.query_pos_head.layers[0].weight)
349
+ xavier_uniform_(self.query_pos_head.layers[1].weight)
350
+ for layer in self.input_proj:
351
+ xavier_uniform_(layer[0].weight)
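
To round off the heads above, the Classify head collapses any spatial size with an adaptive pool and, in eval mode, returns per-class probabilities (softmax over c2); import path illustrative:

import torch
from ultralytics.nn.modules.head import Classify  # illustrative import path

clf = Classify(c1=256, c2=10).eval()
with torch.no_grad():
    probs = clf(torch.randn(4, 256, 20, 20))
print(probs.shape, float(probs[0].sum()))  # torch.Size([4, 10]); each row sums to ~1.0
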
modules/ultralytics/nn/modules/transformer.py ADDED
@@ -0,0 +1,378 @@
1
+ # Ultralytics YOLO 🚀, AGPL-3.0 license
2
+ """
3
+ Transformer modules
4
+ """
5
+
6
+ import math
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+ from torch.nn.init import constant_, xavier_uniform_
12
+
13
+ from .conv import Conv
14
+ from .utils import _get_clones, inverse_sigmoid, multi_scale_deformable_attn_pytorch
15
+
16
+ __all__ = ('TransformerEncoderLayer', 'TransformerLayer', 'TransformerBlock', 'MLPBlock', 'LayerNorm2d', 'AIFI',
17
+ 'DeformableTransformerDecoder', 'DeformableTransformerDecoderLayer', 'MSDeformAttn', 'MLP')
18
+
19
+
20
+ class TransformerEncoderLayer(nn.Module):
21
+ """Transformer Encoder."""
22
+
23
+ def __init__(self, c1, cm=2048, num_heads=8, dropout=0.0, act=nn.GELU(), normalize_before=False):
24
+ super().__init__()
25
+ self.ma = nn.MultiheadAttention(c1, num_heads, dropout=dropout, batch_first=True)
26
+ # Implementation of Feedforward model
27
+ self.fc1 = nn.Linear(c1, cm)
28
+ self.fc2 = nn.Linear(cm, c1)
29
+
30
+ self.norm1 = nn.LayerNorm(c1)
31
+ self.norm2 = nn.LayerNorm(c1)
32
+ self.dropout = nn.Dropout(dropout)
33
+ self.dropout1 = nn.Dropout(dropout)
34
+ self.dropout2 = nn.Dropout(dropout)
35
+
36
+ self.act = act
37
+ self.normalize_before = normalize_before
38
+
39
+ def with_pos_embed(self, tensor, pos=None):
40
+ """Add position embeddings if given."""
41
+ return tensor if pos is None else tensor + pos
42
+
43
+ def forward_post(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
44
+ q = k = self.with_pos_embed(src, pos)
45
+ src2 = self.ma(q, k, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]
46
+ src = src + self.dropout1(src2)
47
+ src = self.norm1(src)
48
+ src2 = self.fc2(self.dropout(self.act(self.fc1(src))))
49
+ src = src + self.dropout2(src2)
50
+ src = self.norm2(src)
51
+ return src
52
+
53
+ def forward_pre(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
54
+ src2 = self.norm1(src)
55
+ q = k = self.with_pos_embed(src2, pos)
56
+ src2 = self.ma(q, k, value=src2, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]
57
+ src = src + self.dropout1(src2)
58
+ src2 = self.norm2(src)
59
+ src2 = self.fc2(self.dropout(self.act(self.fc1(src2))))
60
+ src = src + self.dropout2(src2)
61
+ return src
62
+
63
+ def forward(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
64
+ """Forward propagates the input through the encoder module."""
65
+ if self.normalize_before:
66
+ return self.forward_pre(src, src_mask, src_key_padding_mask, pos)
67
+ return self.forward_post(src, src_mask, src_key_padding_mask, pos)
68
+
69
+
70
+ class AIFI(TransformerEncoderLayer):
71
+
72
+ def __init__(self, c1, cm=2048, num_heads=8, dropout=0, act=nn.GELU(), normalize_before=False):
73
+ super().__init__(c1, cm, num_heads, dropout, act, normalize_before)
74
+
75
+ def forward(self, x):
76
+ c, h, w = x.shape[1:]
77
+ pos_embed = self.build_2d_sincos_position_embedding(w, h, c)
78
+ # flatten [B, C, H, W] to [B, HxW, C]
79
+ x = super().forward(x.flatten(2).permute(0, 2, 1), pos=pos_embed.to(device=x.device, dtype=x.dtype))
80
+ return x.permute(0, 2, 1).view([-1, c, h, w]).contiguous()
81
+
82
+ @staticmethod
83
+ def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.):
84
+ grid_w = torch.arange(int(w), dtype=torch.float32)
85
+ grid_h = torch.arange(int(h), dtype=torch.float32)
86
+ grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing='ij')
87
+ assert embed_dim % 4 == 0, \
88
+ 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
89
+ pos_dim = embed_dim // 4
90
+ omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
91
+ omega = 1. / (temperature ** omega)
92
+
93
+ out_w = grid_w.flatten()[..., None] @ omega[None]
94
+ out_h = grid_h.flatten()[..., None] @ omega[None]
95
+
96
+ return torch.concat([torch.sin(out_w), torch.cos(out_w),
97
+ torch.sin(out_h), torch.cos(out_h)], axis=1)[None, :, :]
98
+
99
+
100
+ class TransformerLayer(nn.Module):
101
+ """Transformer layer https://arxiv.org/abs/2010.11929 (LayerNorm layers removed for better performance)."""
102
+
103
+ def __init__(self, c, num_heads):
104
+ """Initializes a self-attention mechanism using linear transformations and multi-head attention."""
105
+ super().__init__()
106
+ self.q = nn.Linear(c, c, bias=False)
107
+ self.k = nn.Linear(c, c, bias=False)
108
+ self.v = nn.Linear(c, c, bias=False)
109
+ self.ma = nn.MultiheadAttention(embed_dim=c, num_heads=num_heads)
110
+ self.fc1 = nn.Linear(c, c, bias=False)
111
+ self.fc2 = nn.Linear(c, c, bias=False)
112
+
113
+ def forward(self, x):
114
+ """Apply a transformer block to the input x and return the output."""
115
+ x = self.ma(self.q(x), self.k(x), self.v(x))[0] + x
116
+ x = self.fc2(self.fc1(x)) + x
117
+ return x
118
+
119
+
120
+ class TransformerBlock(nn.Module):
121
+ """Vision Transformer https://arxiv.org/abs/2010.11929."""
122
+
123
+ def __init__(self, c1, c2, num_heads, num_layers):
124
+ """Initialize a Transformer module with position embedding and specified number of heads and layers."""
125
+ super().__init__()
126
+ self.conv = None
127
+ if c1 != c2:
128
+ self.conv = Conv(c1, c2)
129
+ self.linear = nn.Linear(c2, c2) # learnable position embedding
130
+ self.tr = nn.Sequential(*(TransformerLayer(c2, num_heads) for _ in range(num_layers)))
131
+ self.c2 = c2
132
+
133
+ def forward(self, x):
134
+ """Forward propagates the input through the bottleneck module."""
135
+ if self.conv is not None:
136
+ x = self.conv(x)
137
+ b, _, w, h = x.shape
138
+ p = x.flatten(2).permute(2, 0, 1)
139
+ return self.tr(p + self.linear(p)).permute(1, 2, 0).reshape(b, self.c2, w, h)
140
+
141
+
142
+ class MLPBlock(nn.Module):
143
+
144
+ def __init__(self, embedding_dim, mlp_dim, act=nn.GELU):
145
+ super().__init__()
146
+ self.lin1 = nn.Linear(embedding_dim, mlp_dim)
147
+ self.lin2 = nn.Linear(mlp_dim, embedding_dim)
148
+ self.act = act()
149
+
150
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
151
+ return self.lin2(self.act(self.lin1(x)))
152
+
153
+
154
+ class MLP(nn.Module):
155
+ """ Very simple multi-layer perceptron (also called FFN)"""
156
+
157
+ def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
158
+ super().__init__()
159
+ self.num_layers = num_layers
160
+ h = [hidden_dim] * (num_layers - 1)
161
+ self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
162
+
163
+ def forward(self, x):
164
+ for i, layer in enumerate(self.layers):
165
+ x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
166
+ return x
167
+
168
+
169
+ # From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa
170
+ # Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa
171
+ class LayerNorm2d(nn.Module):
172
+
173
+ def __init__(self, num_channels, eps=1e-6):
174
+ super().__init__()
175
+ self.weight = nn.Parameter(torch.ones(num_channels))
176
+ self.bias = nn.Parameter(torch.zeros(num_channels))
177
+ self.eps = eps
178
+
179
+ def forward(self, x):
180
+ u = x.mean(1, keepdim=True)
181
+ s = (x - u).pow(2).mean(1, keepdim=True)
182
+ x = (x - u) / torch.sqrt(s + self.eps)
183
+ x = self.weight[:, None, None] * x + self.bias[:, None, None]
184
+ return x
185
+
186
+
187
+ class MSDeformAttn(nn.Module):
188
+ """
189
+ Original Multi-Scale Deformable Attention Module.
190
+ https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/modules/ms_deform_attn.py
191
+ """
192
+
193
+ def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
194
+ super().__init__()
195
+ if d_model % n_heads != 0:
196
+ raise ValueError(f'd_model must be divisible by n_heads, but got {d_model} and {n_heads}')
197
+ _d_per_head = d_model // n_heads
198
+ # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation
199
+ assert _d_per_head * n_heads == d_model, '`d_model` must be divisible by `n_heads`'
200
+
201
+ self.im2col_step = 64
202
+
203
+ self.d_model = d_model
204
+ self.n_levels = n_levels
205
+ self.n_heads = n_heads
206
+ self.n_points = n_points
207
+
208
+ self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
209
+ self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
210
+ self.value_proj = nn.Linear(d_model, d_model)
211
+ self.output_proj = nn.Linear(d_model, d_model)
212
+
213
+ self._reset_parameters()
214
+
215
+ def _reset_parameters(self):
216
+ constant_(self.sampling_offsets.weight.data, 0.)
217
+ thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
218
+ grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
219
+ grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(
220
+ 1, self.n_levels, self.n_points, 1)
221
+ for i in range(self.n_points):
222
+ grid_init[:, :, i, :] *= i + 1
223
+ with torch.no_grad():
224
+ self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
225
+ constant_(self.attention_weights.weight.data, 0.)
226
+ constant_(self.attention_weights.bias.data, 0.)
227
+ xavier_uniform_(self.value_proj.weight.data)
228
+ constant_(self.value_proj.bias.data, 0.)
229
+ xavier_uniform_(self.output_proj.weight.data)
230
+ constant_(self.output_proj.bias.data, 0.)
231
+
232
+ def forward(self, query, refer_bbox, value, value_shapes, value_mask=None):
233
+ """
234
+ https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
235
+ Args:
236
+ query (torch.Tensor): [bs, query_length, C]
237
+ refer_bbox (torch.Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0),
238
+ bottom-right (1, 1), including padding area
239
+ value (torch.Tensor): [bs, value_length, C]
240
+ value_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
241
+ value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements
242
+
243
+ Returns:
244
+ output (Tensor): [bs, Length_{query}, C]
245
+ """
246
+ bs, len_q = query.shape[:2]
247
+ len_v = value.shape[1]
248
+ assert sum(s[0] * s[1] for s in value_shapes) == len_v
249
+
250
+ value = self.value_proj(value)
251
+ if value_mask is not None:
252
+ value = value.masked_fill(value_mask[..., None], float(0))
253
+ value = value.view(bs, len_v, self.n_heads, self.d_model // self.n_heads)
254
+ sampling_offsets = self.sampling_offsets(query).view(bs, len_q, self.n_heads, self.n_levels, self.n_points, 2)
255
+ attention_weights = self.attention_weights(query).view(bs, len_q, self.n_heads, self.n_levels * self.n_points)
256
+ attention_weights = F.softmax(attention_weights, -1).view(bs, len_q, self.n_heads, self.n_levels, self.n_points)
257
+ # N, Len_q, n_heads, n_levels, n_points, 2
258
+ num_points = refer_bbox.shape[-1]
259
+ if num_points == 2:
260
+ offset_normalizer = torch.as_tensor(value_shapes, dtype=query.dtype, device=query.device).flip(-1)
261
+ add = sampling_offsets / offset_normalizer[None, None, None, :, None, :]
262
+ sampling_locations = refer_bbox[:, :, None, :, None, :] + add
263
+ elif num_points == 4:
264
+ add = sampling_offsets / self.n_points * refer_bbox[:, :, None, :, None, 2:] * 0.5
265
+ sampling_locations = refer_bbox[:, :, None, :, None, :2] + add
266
+ else:
267
+ raise ValueError(f'Last dim of reference_points must be 2 or 4, but got {num_points}.')
268
+ output = multi_scale_deformable_attn_pytorch(value, value_shapes, sampling_locations, attention_weights)
269
+ output = self.output_proj(output)
270
+ return output
271
+
272
+
273
+ class DeformableTransformerDecoderLayer(nn.Module):
274
+ """
275
+ https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
276
+ https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/deformable_transformer.py
277
+ """
278
+
279
+ def __init__(self, d_model=256, n_heads=8, d_ffn=1024, dropout=0., act=nn.ReLU(), n_levels=4, n_points=4):
280
+ super().__init__()
281
+
282
+ # self attention
283
+ self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
284
+ self.dropout1 = nn.Dropout(dropout)
285
+ self.norm1 = nn.LayerNorm(d_model)
286
+
287
+ # cross attention
288
+ self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
289
+ self.dropout2 = nn.Dropout(dropout)
290
+ self.norm2 = nn.LayerNorm(d_model)
291
+
292
+ # ffn
293
+ self.linear1 = nn.Linear(d_model, d_ffn)
294
+ self.act = act
295
+ self.dropout3 = nn.Dropout(dropout)
296
+ self.linear2 = nn.Linear(d_ffn, d_model)
297
+ self.dropout4 = nn.Dropout(dropout)
298
+ self.norm3 = nn.LayerNorm(d_model)
299
+
300
+ @staticmethod
301
+ def with_pos_embed(tensor, pos):
302
+ return tensor if pos is None else tensor + pos
303
+
304
+ def forward_ffn(self, tgt):
305
+ tgt2 = self.linear2(self.dropout3(self.act(self.linear1(tgt))))
306
+ tgt = tgt + self.dropout4(tgt2)
307
+ tgt = self.norm3(tgt)
308
+ return tgt
309
+
310
+ def forward(self, embed, refer_bbox, feats, shapes, padding_mask=None, attn_mask=None, query_pos=None):
311
+ # self attention
312
+ q = k = self.with_pos_embed(embed, query_pos)
313
+ tgt = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), embed.transpose(0, 1),
314
+ attn_mask=attn_mask)[0].transpose(0, 1)
315
+ embed = embed + self.dropout1(tgt)
316
+ embed = self.norm1(embed)
317
+
318
+ # cross attention
319
+ tgt = self.cross_attn(self.with_pos_embed(embed, query_pos), refer_bbox.unsqueeze(2), feats, shapes,
320
+ padding_mask)
321
+ embed = embed + self.dropout2(tgt)
322
+ embed = self.norm2(embed)
323
+
324
+ # ffn
325
+ embed = self.forward_ffn(embed)
326
+
327
+ return embed
328
+
329
+
330
+ class DeformableTransformerDecoder(nn.Module):
331
+ """
332
+ https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
333
+ """
334
+
335
+ def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1):
336
+ super().__init__()
337
+ self.layers = _get_clones(decoder_layer, num_layers)
338
+ self.num_layers = num_layers
339
+ self.hidden_dim = hidden_dim
340
+ self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx
341
+
342
+ def forward(
343
+ self,
344
+ embed, # decoder embeddings
345
+ refer_bbox, # anchor
346
+ feats, # image features
347
+ shapes, # feature shapes
348
+ bbox_head,
349
+ score_head,
350
+ pos_mlp,
351
+ attn_mask=None,
352
+ padding_mask=None):
353
+ output = embed
354
+ dec_bboxes = []
355
+ dec_cls = []
356
+ last_refined_bbox = None
357
+ refer_bbox = refer_bbox.sigmoid()
358
+ for i, layer in enumerate(self.layers):
359
+ output = layer(output, refer_bbox, feats, shapes, padding_mask, attn_mask, pos_mlp(refer_bbox))
360
+
361
+ # refine bboxes, (bs, num_queries+num_denoising, 4)
362
+ refined_bbox = torch.sigmoid(bbox_head[i](output) + inverse_sigmoid(refer_bbox))
363
+
364
+ if self.training:
365
+ dec_cls.append(score_head[i](output))
366
+ if i == 0:
367
+ dec_bboxes.append(refined_bbox)
368
+ else:
369
+ dec_bboxes.append(torch.sigmoid(bbox_head[i](output) + inverse_sigmoid(last_refined_bbox)))
370
+ elif i == self.eval_idx:
371
+ dec_cls.append(score_head[i](output))
372
+ dec_bboxes.append(refined_bbox)
373
+ break
374
+
375
+ last_refined_bbox = refined_bbox
376
+ refer_bbox = refined_bbox.detach() if self.training else refined_bbox
377
+
378
+ return torch.stack(dec_bboxes), torch.stack(dec_cls)
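
A minimal sketch of the AIFI layer defined above: it flattens the feature map, runs one encoder layer with a fixed 2-D sin-cos position embedding, and reshapes back, so the output shape matches the input. The embed dim must be divisible by 4 (for the embedding) and by the head count; import path illustrative:

import torch
from ultralytics.nn.modules.transformer import AIFI  # illustrative import path

layer = AIFI(c1=256, cm=1024, num_heads=8).eval()
x = torch.randn(1, 256, 20, 20)
with torch.no_grad():
    y = layer(x)
assert y.shape == x.shape  # (1, 256, 20, 20): intra-scale attention preserves the feature-map shape
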
modules/ultralytics/nn/modules/utils.py ADDED
@@ -0,0 +1,78 @@
1
+ # Ultralytics YOLO 🚀, AGPL-3.0 license
2
+ """
3
+ Module utils
4
+ """
5
+
6
+ import copy
7
+ import math
8
+
9
+ import numpy as np
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+ from torch.nn.init import uniform_
14
+
15
+ __all__ = 'multi_scale_deformable_attn_pytorch', 'inverse_sigmoid'
16
+
17
+
18
+ def _get_clones(module, n):
19
+ return nn.ModuleList([copy.deepcopy(module) for _ in range(n)])
20
+
21
+
22
+ def bias_init_with_prob(prior_prob=0.01):
23
+ """initialize conv/fc bias value according to a given probability value."""
24
+ return float(-np.log((1 - prior_prob) / prior_prob)) # return bias_init
25
+
26
+
27
+ def linear_init_(module):
28
+ bound = 1 / math.sqrt(module.weight.shape[0])
29
+ uniform_(module.weight, -bound, bound)
30
+ if hasattr(module, 'bias') and module.bias is not None:
31
+ uniform_(module.bias, -bound, bound)
32
+
33
+
34
+ def inverse_sigmoid(x, eps=1e-5):
35
+ x = x.clamp(min=0, max=1)
36
+ x1 = x.clamp(min=eps)
37
+ x2 = (1 - x).clamp(min=eps)
38
+ return torch.log(x1 / x2)
39
+
40
+
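
`inverse_sigmoid` above is a clamped logit; a spot check of the round trip and of the clamping at the ends of [0, 1] (only the import path is assumed):

import torch
from ultralytics.nn.modules.utils import inverse_sigmoid  # illustrative import path

p = torch.tensor([0.1, 0.5, 0.9])
assert torch.allclose(torch.sigmoid(inverse_sigmoid(p)), p, atol=1e-6)
print(inverse_sigmoid(torch.tensor([0.0, 1.0])))  # clamped to roughly +/- log(1 / 1e-5) ~= +/- 11.51
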
41
+ def multi_scale_deformable_attn_pytorch(value: torch.Tensor, value_spatial_shapes: torch.Tensor,
42
+ sampling_locations: torch.Tensor,
43
+ attention_weights: torch.Tensor) -> torch.Tensor:
44
+ """
45
+ Multi-scale deformable attention.
46
+ https://github.com/IDEA-Research/detrex/blob/main/detrex/layers/multi_scale_deform_attn.py
47
+ """
48
+
49
+ bs, _, num_heads, embed_dims = value.shape
50
+ _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
51
+ value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
52
+ sampling_grids = 2 * sampling_locations - 1
53
+ sampling_value_list = []
54
+ for level, (H_, W_) in enumerate(value_spatial_shapes):
55
+ # bs, H_*W_, num_heads, embed_dims ->
56
+ # bs, H_*W_, num_heads*embed_dims ->
57
+ # bs, num_heads*embed_dims, H_*W_ ->
58
+ # bs*num_heads, embed_dims, H_, W_
59
+ value_l_ = (value_list[level].flatten(2).transpose(1, 2).reshape(bs * num_heads, embed_dims, H_, W_))
60
+ # bs, num_queries, num_heads, num_points, 2 ->
61
+ # bs, num_heads, num_queries, num_points, 2 ->
62
+ # bs*num_heads, num_queries, num_points, 2
63
+ sampling_grid_l_ = sampling_grids[:, :, :, level].transpose(1, 2).flatten(0, 1)
64
+ # bs*num_heads, embed_dims, num_queries, num_points
65
+ sampling_value_l_ = F.grid_sample(value_l_,
66
+ sampling_grid_l_,
67
+ mode='bilinear',
68
+ padding_mode='zeros',
69
+ align_corners=False)
70
+ sampling_value_list.append(sampling_value_l_)
71
+ # (bs, num_queries, num_heads, num_levels, num_points) ->
72
+ # (bs, num_heads, num_queries, num_levels, num_points) ->
73
+ # (bs, num_heads, 1, num_queries, num_levels*num_points)
74
+ attention_weights = attention_weights.transpose(1, 2).reshape(bs * num_heads, 1, num_queries,
75
+ num_levels * num_points)
76
+ output = ((torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(
77
+ bs, num_heads * embed_dims, num_queries))
78
+ return output.transpose(1, 2).contiguous()
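
Before `tasks.py`, a shape-only sketch of `multi_scale_deformable_attn_pytorch` above, wiring up the tensor layout it expects for two feature levels (all sizes and the import path are illustrative):

import torch
from ultralytics.nn.modules.utils import multi_scale_deformable_attn_pytorch  # illustrative import path

bs, heads, head_dim = 2, 8, 32
shapes = [(16, 16), (8, 8)]                        # (H, W) per feature level
len_v = sum(h * w for h, w in shapes)              # 320 flattened key positions
nq, n_points = 10, 4
value = torch.randn(bs, len_v, heads, head_dim)
locs = torch.rand(bs, nq, heads, len(shapes), n_points, 2)   # normalised (x, y) sampling locations
weights = torch.rand(bs, nq, heads, len(shapes), n_points)
weights = weights / weights.sum((-2, -1), keepdim=True)      # weights sum to 1 over levels x points
out = multi_scale_deformable_attn_pytorch(value, shapes, locs, weights)
print(out.shape)  # torch.Size([2, 10, 256]) == (bs, num_queries, heads * head_dim)
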
modules/ultralytics/nn/tasks.py ADDED
@@ -0,0 +1,780 @@
1
+ # Ultralytics YOLO 🚀, AGPL-3.0 license
2
+
3
+ import contextlib
4
+ from copy import deepcopy
5
+ from pathlib import Path
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+
10
+ from ultralytics.nn.modules import (AIFI, C1, C2, C3, C3TR, SPP, SPPF, Bottleneck, BottleneckCSP, C2f, C3Ghost, C3x,
11
+ Classify, Concat, Conv, Conv2, ConvTranspose, Detect, DWConv, DWConvTranspose2d,
12
+ Focus, GhostBottleneck, GhostConv, HGBlock, HGStem, Pose, RepC3, RepConv,
13
+ RTDETRDecoder, Segment)
14
+ from ultralytics.yolo.utils import DEFAULT_CFG_DICT, DEFAULT_CFG_KEYS, LOGGER, colorstr, emojis, yaml_load
15
+ from ultralytics.yolo.utils.checks import check_requirements, check_suffix, check_yaml
16
+ from ultralytics.yolo.utils.loss import v8ClassificationLoss, v8DetectionLoss, v8PoseLoss, v8SegmentationLoss
17
+ from ultralytics.yolo.utils.plotting import feature_visualization
18
+ from ultralytics.yolo.utils.torch_utils import (fuse_conv_and_bn, fuse_deconv_and_bn, initialize_weights,
19
+ intersect_dicts, make_divisible, model_info, scale_img, time_sync)
20
+
21
+ try:
22
+ import thop
23
+ except ImportError:
24
+ thop = None
25
+
26
+
27
+ class BaseModel(nn.Module):
28
+ """
29
+ The BaseModel class serves as a base class for all the models in the Ultralytics YOLO family.
30
+ """
31
+
32
+ def forward(self, x, *args, **kwargs):
33
+ """
34
+ Forward pass of the model on a single scale.
35
+ Wrapper for `_forward_once` method.
36
+
37
+ Args:
38
+ x (torch.Tensor | dict): The input image tensor or a dict including image tensor and gt labels.
39
+
40
+ Returns:
41
+ (torch.Tensor): The output of the network.
42
+ """
43
+ if isinstance(x, dict): # for cases of training and validating while training.
44
+ return self.loss(x, *args, **kwargs)
45
+ return self.predict(x, *args, **kwargs)
46
+
47
+ def predict(self, x, profile=False, visualize=False, augment=False):
48
+ """
49
+ Perform a forward pass through the network.
50
+
51
+ Args:
52
+ x (torch.Tensor): The input tensor to the model.
53
+ profile (bool): Print the computation time of each layer if True, defaults to False.
54
+ visualize (bool): Save the feature maps of the model if True, defaults to False.
55
+ augment (bool): Augment image during prediction, defaults to False.
56
+
57
+ Returns:
58
+ (torch.Tensor): The last output of the model.
59
+ """
60
+ if augment:
61
+ return self._predict_augment(x)
62
+ return self._predict_once(x, profile, visualize)
63
+
64
+ def _predict_once(self, x, profile=False, visualize=False):
65
+ """
66
+ Perform a forward pass through the network.
67
+
68
+ Args:
69
+ x (torch.Tensor): The input tensor to the model.
70
+ profile (bool): Print the computation time of each layer if True, defaults to False.
71
+ visualize (bool): Save the feature maps of the model if True, defaults to False.
72
+
73
+ Returns:
74
+ (torch.Tensor): The last output of the model.
75
+ """
76
+ y, dt = [], [] # outputs
77
+ for m in self.model:
78
+ if m.f != -1: # if not from previous layer
79
+ x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f] # from earlier layers
80
+ if profile:
81
+ self._profile_one_layer(m, x, dt)
82
+ x = m(x) # run
83
+ y.append(x if m.i in self.save else None) # save output
84
+ if visualize:
85
+ feature_visualization(x, m.type, m.i, save_dir=visualize)
86
+ return x
87
+
88
+ def _predict_augment(self, x):
89
+ """Perform augmentations on input image x and return augmented inference."""
90
+ LOGGER.warning(
91
+ f'WARNING ⚠️ {self.__class__.__name__} does not support augmented inference yet. Using single-scale inference instead.'
92
+ )
93
+ return self._predict_once(x)
94
+
95
+ def _profile_one_layer(self, m, x, dt):
96
+ """
97
+ Profile the computation time and FLOPs of a single layer of the model on a given input.
98
+ Appends the results to the provided list.
99
+
100
+ Args:
101
+ m (nn.Module): The layer to be profiled.
102
+ x (torch.Tensor): The input data to the layer.
103
+ dt (list): A list to store the computation time of the layer.
104
+
105
+ Returns:
106
+ None
107
+ """
108
+ c = m == self.model[-1] # is final layer, copy input as inplace fix
109
+ o = thop.profile(m, inputs=[x.clone() if c else x], verbose=False)[0] / 1E9 * 2 if thop else 0 # FLOPs
110
+ t = time_sync()
111
+ for _ in range(10):
112
+ m(x.clone() if c else x)
113
+ dt.append((time_sync() - t) * 100)
114
+ if m == self.model[0]:
115
+ LOGGER.info(f"{'time (ms)':>10s} {'GFLOPs':>10s} {'params':>10s} module")
116
+ LOGGER.info(f'{dt[-1]:10.2f} {o:10.2f} {m.np:10.0f} {m.type}')
117
+ if c:
118
+ LOGGER.info(f"{sum(dt):10.2f} {'-':>10s} {'-':>10s} Total")
119
+
120
+ def fuse(self, verbose=True):
121
+ """
122
+ Fuse the `Conv2d()` and `BatchNorm2d()` layers of the model into a single layer, in order to improve the
123
+ computation efficiency.
124
+
125
+ Returns:
126
+ (nn.Module): The fused model is returned.
127
+ """
128
+ if not self.is_fused():
129
+ for m in self.model.modules():
130
+ if isinstance(m, (Conv, Conv2, DWConv)) and hasattr(m, 'bn'):
131
+ if isinstance(m, Conv2):
132
+ m.fuse_convs()
133
+ m.conv = fuse_conv_and_bn(m.conv, m.bn) # update conv
134
+ delattr(m, 'bn') # remove batchnorm
135
+ m.forward = m.forward_fuse # update forward
136
+ if isinstance(m, ConvTranspose) and hasattr(m, 'bn'):
137
+ m.conv_transpose = fuse_deconv_and_bn(m.conv_transpose, m.bn)
138
+ delattr(m, 'bn') # remove batchnorm
139
+ m.forward = m.forward_fuse # update forward
140
+ if isinstance(m, RepConv):
141
+ m.fuse_convs()
142
+ m.forward = m.forward_fuse # update forward
143
+ self.info(verbose=verbose)
144
+
145
+ return self
146
+
147
+ def is_fused(self, thresh=10):
148
+ """
149
+ Check if the model has less than a certain threshold of BatchNorm layers.
150
+
151
+ Args:
152
+ thresh (int, optional): The threshold number of BatchNorm layers. Default is 10.
153
+
154
+ Returns:
155
+ (bool): True if the number of BatchNorm layers in the model is less than the threshold, False otherwise.
156
+ """
157
+ bn = tuple(v for k, v in nn.__dict__.items() if 'Norm' in k) # normalization layers, i.e. BatchNorm2d()
158
+ return sum(isinstance(v, bn) for v in self.modules()) < thresh # True if < 'thresh' BatchNorm layers in model
159
+
160
+ def info(self, detailed=False, verbose=True, imgsz=640):
161
+ """
162
+ Prints model information
163
+
164
+ Args:
165
+ verbose (bool): if True, prints out the model information. Defaults to True
166
+ imgsz (int): the size of the image that the model will be trained on. Defaults to 640
167
+ """
168
+ return model_info(self, detailed=detailed, verbose=verbose, imgsz=imgsz)
169
+
170
+ def _apply(self, fn):
171
+ """
172
+ Applies `fn` to all tensors in the model that are not
173
+ parameters or registered buffers
174
+
175
+ Args:
176
+ fn: the function to apply to the model
177
+
178
+ Returns:
179
+ (BaseModel): The model itself, with the Detect head's stride, anchors and strides tensors also transformed by `fn`.
180
+ """
181
+ self = super()._apply(fn)
182
+ m = self.model[-1] # Detect()
183
+ if isinstance(m, (Detect, Segment)):
184
+ m.stride = fn(m.stride)
185
+ m.anchors = fn(m.anchors)
186
+ m.strides = fn(m.strides)
187
+ return self
188
+
189
+ def load(self, weights, verbose=True):
190
+ """Load the weights into the model.
191
+
192
+ Args:
193
+ weights (dict | torch.nn.Module): The pre-trained weights to be loaded.
194
+ verbose (bool, optional): Whether to log the transfer progress. Defaults to True.
195
+ """
196
+ model = weights['model'] if isinstance(weights, dict) else weights # torchvision models are not dicts
197
+ csd = model.float().state_dict() # checkpoint state_dict as FP32
198
+ csd = intersect_dicts(csd, self.state_dict()) # intersect
199
+ self.load_state_dict(csd, strict=False) # load
200
+ if verbose:
201
+ LOGGER.info(f'Transferred {len(csd)}/{len(self.model.state_dict())} items from pretrained weights')
202
+
203
+ def loss(self, batch, preds=None):
204
+ """
205
+ Compute loss
206
+
207
+ Args:
208
+ batch (dict): Batch to compute loss on
209
+ preds (torch.Tensor | List[torch.Tensor]): Predictions.
210
+ """
211
+ if not hasattr(self, 'criterion'):
212
+ self.criterion = self.init_criterion()
213
+
214
+ preds = self.forward(batch['img']) if preds is None else preds
215
+ return self.criterion(preds, batch)
216
+
217
+ def init_criterion(self):
218
+ raise NotImplementedError('compute_loss() needs to be implemented by task heads')
219
+
220
+
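+ # Usage sketch for the forward() dispatch above (assumes a concrete subclass such as the
+ # DetectionModel defined below, a valid model YAML, and a correctly formatted batch dict):
+ #   model = DetectionModel('yolov8n.yaml', ch=3, nc=80)   # build from config
+ #   preds = model(torch.zeros(1, 3, 640, 640))            # tensor input -> inference output
+ #   loss, loss_items = model(batch)                       # dict input -> training loss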
221
+ class DetectionModel(BaseModel):
222
+ """YOLOv8 detection model."""
223
+
224
+ def __init__(self, cfg='yolov8n.yaml', ch=3, nc=None, verbose=True): # model, input channels, number of classes
225
+ super().__init__()
226
+ self.yaml = cfg if isinstance(cfg, dict) else yaml_model_load(cfg) # cfg dict
227
+
228
+ # Define model
229
+ ch = self.yaml['ch'] = self.yaml.get('ch', ch) # input channels
230
+ if nc and nc != self.yaml['nc']:
231
+ LOGGER.info(f"Overriding model.yaml nc={self.yaml['nc']} with nc={nc}")
232
+ self.yaml['nc'] = nc # override yaml value
233
+ self.model, self.save = parse_model(deepcopy(self.yaml), ch=ch, verbose=verbose) # model, savelist
234
+ self.names = {i: f'{i}' for i in range(self.yaml['nc'])} # default names dict
235
+ self.inplace = self.yaml.get('inplace', True)
236
+
237
+ # Build strides
238
+ m = self.model[-1] # Detect()
239
+ if isinstance(m, (Detect, Segment, Pose)):
240
+ s = 256 # 2x min stride
241
+ m.inplace = self.inplace
242
+ forward = lambda x: self.forward(x)[0] if isinstance(m, (Segment, Pose)) else self.forward(x)
243
+ m.stride = torch.tensor([s / x.shape[-2] for x in forward(torch.zeros(1, ch, s, s))]) # forward
244
+ self.stride = m.stride
245
+ m.bias_init() # only run once
246
+ else:
247
+ self.stride = torch.Tensor([32]) # default stride, e.g. for RTDETR
248
+
249
+ # Init weights, biases
250
+ initialize_weights(self)
251
+ if verbose:
252
+ self.info()
253
+ LOGGER.info('')
254
+
255
+ def _predict_augment(self, x):
256
+ """Perform augmentations on input image x and return augmented inference and train outputs."""
257
+ img_size = x.shape[-2:] # height, width
258
+ s = [1, 0.83, 0.67] # scales
259
+ f = [None, 3, None] # flips (2-ud, 3-lr)
260
+ y = [] # outputs
261
+ for si, fi in zip(s, f):
262
+ xi = scale_img(x.flip(fi) if fi else x, si, gs=int(self.stride.max()))
263
+ yi = super().predict(xi)[0] # forward
264
+ # cv2.imwrite(f'img_{si}.jpg', 255 * xi[0].cpu().numpy().transpose((1, 2, 0))[:, :, ::-1]) # save
265
+ yi = self._descale_pred(yi, fi, si, img_size)
266
+ y.append(yi)
267
+ y = self._clip_augmented(y) # clip augmented tails
268
+ return torch.cat(y, -1), None # augmented inference, train
269
+
270
+ @staticmethod
271
+ def _descale_pred(p, flips, scale, img_size, dim=1):
272
+ """De-scale predictions following augmented inference (inverse operation)."""
273
+ p[:, :4] /= scale # de-scale
274
+ x, y, wh, cls = p.split((1, 1, 2, p.shape[dim] - 4), dim)
275
+ if flips == 2:
276
+ y = img_size[0] - y # de-flip ud
277
+ elif flips == 3:
278
+ x = img_size[1] - x # de-flip lr
279
+ return torch.cat((x, y, wh, cls), dim)
280
+
281
+ def _clip_augmented(self, y):
282
+ """Clip YOLOv5 augmented inference tails."""
283
+ nl = self.model[-1].nl # number of detection layers (P3-P5)
284
+ g = sum(4 ** x for x in range(nl)) # grid points
285
+ e = 1 # exclude layer count
286
+ i = (y[0].shape[-1] // g) * sum(4 ** x for x in range(e)) # indices
287
+ y[0] = y[0][..., :-i] # large
288
+ i = (y[-1].shape[-1] // g) * sum(4 ** (nl - 1 - x) for x in range(e)) # indices
289
+ y[-1] = y[-1][..., i:] # small
290
+ return y
291
+
292
+ def init_criterion(self):
293
+ return v8DetectionLoss(self)
294
+
295
+
296
+ class SegmentationModel(DetectionModel):
297
+ """YOLOv8 segmentation model."""
298
+
299
+ def __init__(self, cfg='yolov8n-seg.yaml', ch=3, nc=None, verbose=True):
300
+ """Initialize YOLOv8 segmentation model with given config and parameters."""
301
+ super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)
302
+
303
+ def init_criterion(self):
304
+ return v8SegmentationLoss(self)
305
+
306
+ def _predict_augment(self, x):
307
+ """Perform augmentations on input image x and return augmented inference."""
308
+ LOGGER.warning(
309
+ f'WARNING ⚠️ {self.__class__.__name__} does not support augmented inference yet. Using single-scale inference instead.'
310
+ )
311
+ return self._predict_once(x)
312
+
313
+
314
+ class PoseModel(DetectionModel):
315
+ """YOLOv8 pose model."""
316
+
317
+ def __init__(self, cfg='yolov8n-pose.yaml', ch=3, nc=None, data_kpt_shape=(None, None), verbose=True):
318
+ """Initialize YOLOv8 Pose model."""
319
+ if not isinstance(cfg, dict):
320
+ cfg = yaml_model_load(cfg) # load model YAML
321
+ if any(data_kpt_shape) and list(data_kpt_shape) != list(cfg['kpt_shape']):
322
+ LOGGER.info(f"Overriding model.yaml kpt_shape={cfg['kpt_shape']} with kpt_shape={data_kpt_shape}")
323
+ cfg['kpt_shape'] = data_kpt_shape
324
+ super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)
325
+
326
+ def init_criterion(self):
327
+ return v8PoseLoss(self)
328
+
329
+ def _predict_augment(self, x):
330
+ """Perform augmentations on input image x and return augmented inference."""
331
+ LOGGER.warning(
332
+ f'WARNING ⚠️ {self.__class__.__name__} does not support augmented inference yet. Using single-scale inference instead.'
333
+ )
334
+ return self._predict_once(x)
335
+
336
+
337
+ class ClassificationModel(BaseModel):
338
+ """YOLOv8 classification model."""
339
+
340
+ def __init__(self,
341
+ cfg=None,
342
+ model=None,
343
+ ch=3,
344
+ nc=None,
345
+ cutoff=10,
346
+ verbose=True): # yaml, model, channels, number of classes, cutoff index, verbose flag
347
+ super().__init__()
348
+ self._from_detection_model(model, nc, cutoff) if model is not None else self._from_yaml(cfg, ch, nc, verbose)
349
+
350
+ def _from_detection_model(self, model, nc=1000, cutoff=10):
351
+ """Create a YOLOv5 classification model from a YOLOv5 detection model."""
352
+ from ultralytics.nn.autobackend import AutoBackend
353
+ if isinstance(model, AutoBackend):
354
+ model = model.model # unwrap DetectMultiBackend
355
+ model.model = model.model[:cutoff] # backbone
356
+ m = model.model[-1] # last layer
357
+ ch = m.conv.in_channels if hasattr(m, 'conv') else m.cv1.conv.in_channels # ch into module
358
+ c = Classify(ch, nc) # Classify()
359
+ c.i, c.f, c.type = m.i, m.f, 'models.common.Classify' # index, from, type
360
+ model.model[-1] = c # replace
361
+ self.model = model.model
362
+ self.stride = model.stride
363
+ self.save = []
364
+ self.nc = nc
365
+
366
+ def _from_yaml(self, cfg, ch, nc, verbose):
367
+ """Set YOLOv8 model configurations and define the model architecture."""
368
+ self.yaml = cfg if isinstance(cfg, dict) else yaml_model_load(cfg) # cfg dict
369
+
370
+ # Define model
371
+ ch = self.yaml['ch'] = self.yaml.get('ch', ch) # input channels
372
+ if nc and nc != self.yaml['nc']:
373
+ LOGGER.info(f"Overriding model.yaml nc={self.yaml['nc']} with nc={nc}")
374
+ self.yaml['nc'] = nc # override yaml value
375
+ elif not nc and not self.yaml.get('nc', None):
376
+ raise ValueError('nc not specified. Must specify nc in model.yaml or function arguments.')
377
+ self.model, self.save = parse_model(deepcopy(self.yaml), ch=ch, verbose=verbose) # model, savelist
378
+ self.stride = torch.Tensor([1]) # no stride constraints
379
+ self.names = {i: f'{i}' for i in range(self.yaml['nc'])} # default names dict
380
+ self.info()
381
+
382
+ @staticmethod
383
+ def reshape_outputs(model, nc):
384
+ """Update a TorchVision classification model to class count `nc` if required."""
385
+ name, m = list((model.model if hasattr(model, 'model') else model).named_children())[-1] # last module
386
+ if isinstance(m, Classify): # YOLO Classify() head
387
+ if m.linear.out_features != nc:
388
+ m.linear = nn.Linear(m.linear.in_features, nc)
389
+ elif isinstance(m, nn.Linear): # ResNet, EfficientNet
390
+ if m.out_features != nc:
391
+ setattr(model, name, nn.Linear(m.in_features, nc))
392
+ elif isinstance(m, nn.Sequential):
393
+ types = [type(x) for x in m]
394
+ if nn.Linear in types:
395
+ i = types.index(nn.Linear) # nn.Linear index
396
+ if m[i].out_features != nc:
397
+ m[i] = nn.Linear(m[i].in_features, nc)
398
+ elif nn.Conv2d in types:
399
+ i = types.index(nn.Conv2d) # nn.Conv2d index
400
+ if m[i].out_channels != nc:
401
+ m[i] = nn.Conv2d(m[i].in_channels, nc, m[i].kernel_size, m[i].stride, bias=m[i].bias is not None)
402
+
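+ # Example sketch for reshape_outputs (assumes torchvision is installed; names are illustrative):
+ #   import torchvision
+ #   tv_model = torchvision.models.resnet18(weights=None)
+ #   ClassificationModel.reshape_outputs(tv_model, nc=10)  # swaps the final nn.Linear to 10 classes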
403
+ def init_criterion(self):
404
+ """Compute the classification loss between predictions and true labels."""
405
+ return v8ClassificationLoss()
406
+
407
+
408
+ class RTDETRDetectionModel(DetectionModel):
409
+
410
+ def __init__(self, cfg='rtdetr-l.yaml', ch=3, nc=None, verbose=True):
411
+ super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)
412
+
413
+ def init_criterion(self):
414
+ """Compute the classification loss between predictions and true labels."""
415
+ from ultralytics.vit.utils.loss import RTDETRDetectionLoss
416
+
417
+ return RTDETRDetectionLoss(nc=self.nc, use_vfl=True)
418
+
419
+ def loss(self, batch, preds=None):
420
+ if not hasattr(self, 'criterion'):
421
+ self.criterion = self.init_criterion()
422
+
423
+ img = batch['img']
424
+ # NOTE: preprocess gt_bbox and gt_labels to list.
425
+ bs = len(img)
426
+ batch_idx = batch['batch_idx']
427
+ gt_groups = [(batch_idx == i).sum().item() for i in range(bs)]
428
+ targets = {
429
+ 'cls': batch['cls'].to(img.device, dtype=torch.long).view(-1),
430
+ 'bboxes': batch['bboxes'].to(device=img.device),
431
+ 'batch_idx': batch_idx.to(img.device, dtype=torch.long).view(-1),
432
+ 'gt_groups': gt_groups}
433
+
434
+ preds = self.predict(img, batch=targets) if preds is None else preds
435
+ dec_bboxes, dec_scores, enc_bboxes, enc_scores, dn_meta = preds
436
+ if dn_meta is None:
437
+ dn_bboxes, dn_scores = None, None
438
+ else:
439
+ dn_bboxes, dec_bboxes = torch.split(dec_bboxes, dn_meta['dn_num_split'], dim=2)
440
+ dn_scores, dec_scores = torch.split(dec_scores, dn_meta['dn_num_split'], dim=2)
441
+
442
+ dec_bboxes = torch.cat([enc_bboxes.unsqueeze(0), dec_bboxes]) # (7, bs, 300, 4)
443
+ dec_scores = torch.cat([enc_scores.unsqueeze(0), dec_scores])
444
+
445
+ loss = self.criterion((dec_bboxes, dec_scores),
446
+ targets,
447
+ dn_bboxes=dn_bboxes,
448
+ dn_scores=dn_scores,
449
+ dn_meta=dn_meta)
450
+ # NOTE: RT-DETR produces roughly a dozen loss terms; all of them contribute to the backward pass, but only the three main losses are reported.
451
+ return sum(loss.values()), torch.as_tensor([loss[k].detach() for k in ['loss_giou', 'loss_class', 'loss_bbox']],
452
+ device=img.device)
453
+
454
+ def predict(self, x, profile=False, visualize=False, batch=None, augment=False):
455
+ """
456
+ Perform a forward pass through the network.
457
+
458
+ Args:
459
+ x (torch.Tensor): The input tensor to the model
460
+ profile (bool): Print the computation time of each layer if True, defaults to False.
461
+ visualize (bool): Save the feature maps of the model if True, defaults to False
462
+ batch (dict): A dict including gt boxes and labels from dataloader.
463
+
464
+ Returns:
465
+ (torch.Tensor): The last output of the model.
466
+ """
467
+ y, dt = [], [] # outputs
468
+ for m in self.model[:-1]: # except the head part
469
+ if m.f != -1: # if not from previous layer
470
+ x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f] # from earlier layers
471
+ if profile:
472
+ self._profile_one_layer(m, x, dt)
473
+ x = m(x) # run
474
+ y.append(x if m.i in self.save else None) # save output
475
+ if visualize:
476
+ feature_visualization(x, m.type, m.i, save_dir=visualize)
477
+ head = self.model[-1]
478
+ x = head([y[j] for j in head.f], batch) # head inference
479
+ return x
480
+
481
+
482
+ class Ensemble(nn.ModuleList):
483
+ """Ensemble of models."""
484
+
485
+ def __init__(self):
486
+ """Initialize an ensemble of models."""
487
+ super().__init__()
488
+
489
+ def forward(self, x, augment=False, profile=False, visualize=False):
490
+ """Run every model in the ensemble on input x and concatenate their outputs (NMS-style ensemble)."""
491
+ y = [module(x, augment, profile, visualize)[0] for module in self]
492
+ # y = torch.stack(y).max(0)[0] # max ensemble
493
+ # y = torch.stack(y).mean(0) # mean ensemble
494
+ y = torch.cat(y, 2) # nms ensemble, y shape(B, HW, C)
495
+ return y, None # inference, train output
496
+
497
+
498
+ # Functions ------------------------------------------------------------------------------------------------------------
499
+
500
+
501
+ def torch_safe_load(weight):
502
+ """
503
+ This function attempts to load a PyTorch model with the torch.load() function. If a ModuleNotFoundError is raised,
504
+ it catches the error, logs a warning message, and attempts to install the missing module via the
505
+ check_requirements() function. After installation, the function again attempts to load the model using torch.load().
506
+
507
+ Args:
508
+ weight (str): The file path of the PyTorch model.
509
+
510
+ Returns:
511
+ (dict): The loaded PyTorch model.
512
+ """
513
+ from ultralytics.yolo.utils.downloads import attempt_download_asset
514
+
515
+ check_suffix(file=weight, suffix='.pt')
516
+ file = attempt_download_asset(weight) # search online if missing locally
517
+ try:
518
+ return torch.load(file, map_location='cpu'), file # load
519
+ except ModuleNotFoundError as e: # e.name is missing module name
520
+ if e.name == 'models':
521
+ raise TypeError(
522
+ emojis(f'ERROR ❌️ {weight} appears to be an Ultralytics YOLOv5 model originally trained '
523
+ f'with https://github.com/ultralytics/yolov5.\nThis model is NOT forwards compatible with '
524
+ f'YOLOv8 at https://github.com/ultralytics/ultralytics.'
525
+ f"\nRecommended fixes are to train a new model using the latest 'ultralytics' package or to "
526
+ f"run a command with an official YOLOv8 model, i.e. 'yolo predict model=yolov8n.pt'")) from e
527
+ LOGGER.warning(f"WARNING ⚠️ {weight} appears to require '{e.name}', which is not in ultralytics requirements."
528
+ f"\nAutoInstall will run now for '{e.name}' but this feature will be removed in the future."
529
+ f"\nRecommended fixes are to train a new model using the latest 'ultralytics' package or to "
530
+ f"run a command with an official YOLOv8 model, i.e. 'yolo predict model=yolov8n.pt'")
531
+ check_requirements(e.name) # install missing module
532
+
533
+ return torch.load(file, map_location='cpu'), file # load
534
+
535
+
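+ # Example sketch (assumes the weight file exists locally or can be fetched as an asset):
+ #   ckpt, file = torch_safe_load('yolov8n.pt')
+ #   model = ckpt.get('ema') or ckpt['model']   # the checkpoint dict holds the nn.Module under 'ema'/'model'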
536
+ def attempt_load_weights(weights, device=None, inplace=True, fuse=False):
537
+ """Loads an ensemble of models weights=[a,b,c] or a single model weights=[a] or weights=a."""
538
+
539
+ ensemble = Ensemble()
540
+ for w in weights if isinstance(weights, list) else [weights]:
541
+ ckpt, w = torch_safe_load(w) # load ckpt
542
+ args = {**DEFAULT_CFG_DICT, **ckpt['train_args']} if 'train_args' in ckpt else None # combined args
543
+ model = (ckpt.get('ema') or ckpt['model']).to(device).float() # FP32 model
544
+
545
+ # Model compatibility updates
546
+ model.args = args # attach args to model
547
+ model.pt_path = w # attach *.pt file path to model
548
+ model.task = guess_model_task(model)
549
+ if not hasattr(model, 'stride'):
550
+ model.stride = torch.tensor([32.])
551
+
552
+ # Append
553
+ ensemble.append(model.fuse().eval() if fuse and hasattr(model, 'fuse') else model.eval()) # model in eval mode
554
+
555
+ # Module compatibility updates
556
+ for m in ensemble.modules():
557
+ t = type(m)
558
+ if t in (nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU, Detect, Segment):
559
+ m.inplace = inplace # torch 1.7.0 compatibility
560
+ elif t is nn.Upsample and not hasattr(m, 'recompute_scale_factor'):
561
+ m.recompute_scale_factor = None # torch 1.11.0 compatibility
562
+
563
+ # Return model
564
+ if len(ensemble) == 1:
565
+ return ensemble[-1]
566
+
567
+ # Return ensemble
568
+ LOGGER.info(f'Ensemble created with {weights}\n')
569
+ for k in 'names', 'nc', 'yaml':
570
+ setattr(ensemble, k, getattr(ensemble[0], k))
571
+ ensemble.stride = ensemble[torch.argmax(torch.tensor([m.stride.max() for m in ensemble])).int()].stride
572
+ assert all(ensemble[0].nc == m.nc for m in ensemble), f'Models differ in class counts {[m.nc for m in ensemble]}'
573
+ return ensemble
574
+
575
+
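+ # Example sketch (weight file names are placeholders): a list of weights returns an Ensemble,
+ # while a single path returns a plain model, both in eval mode:
+ #   ensemble = attempt_load_weights(['a.pt', 'b.pt'], device='cpu', fuse=True)
+ #   model = attempt_load_weights('yolov8n.pt', device='cpu')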
576
+ def attempt_load_one_weight(weight, device=None, inplace=True, fuse=False):
577
+ """Loads a single model's weights."""
578
+ ckpt, weight = torch_safe_load(weight) # load ckpt
579
+ args = {**DEFAULT_CFG_DICT, **(ckpt.get('train_args', {}))} # combine model and default args, preferring model args
580
+ model = (ckpt.get('ema') or ckpt['model']).to(device).float() # FP32 model
581
+
582
+ # Model compatibility updates
583
+ model.args = {k: v for k, v in args.items() if k in DEFAULT_CFG_KEYS} # attach args to model
584
+ model.pt_path = weight # attach *.pt file path to model
585
+ model.task = guess_model_task(model)
586
+ if not hasattr(model, 'stride'):
587
+ model.stride = torch.tensor([32.])
588
+
589
+ model = model.fuse().eval() if fuse and hasattr(model, 'fuse') else model.eval() # model in eval mode
590
+
591
+ # Module compatibility updates
592
+ for m in model.modules():
593
+ t = type(m)
594
+ if t in (nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU, Detect, Segment):
595
+ m.inplace = inplace # torch 1.7.0 compatibility
596
+ elif t is nn.Upsample and not hasattr(m, 'recompute_scale_factor'):
597
+ m.recompute_scale_factor = None # torch 1.11.0 compatibility
598
+
599
+ # Return model and ckpt
600
+ return model, ckpt
601
+
602
+
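+ # Example sketch: load one checkpoint plus its metadata, fusing Conv+BN where supported:
+ #   model, ckpt = attempt_load_one_weight('yolov8n.pt', device='cpu', fuse=True)
+ #   task = model.task   # set above via guess_model_task()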
603
+ def parse_model(d, ch, verbose=True): # model_dict, input_channels(3)
604
+ # Parse a YOLO model.yaml dictionary into a PyTorch model
605
+ import ast
606
+
607
+ # Args
608
+ max_channels = float('inf')
609
+ nc, act, scales = (d.get(x) for x in ('nc', 'activation', 'scales'))
610
+ depth, width, kpt_shape = (d.get(x, 1.0) for x in ('depth_multiple', 'width_multiple', 'kpt_shape'))
611
+ if scales:
612
+ scale = d.get('scale')
613
+ if not scale:
614
+ scale = tuple(scales.keys())[0]
615
+ LOGGER.warning(f"WARNING ⚠️ no model scale passed. Assuming scale='{scale}'.")
616
+ depth, width, max_channels = scales[scale]
617
+
618
+ if act:
619
+ Conv.default_act = eval(act) # redefine default activation, i.e. Conv.default_act = nn.SiLU()
620
+ if verbose:
621
+ LOGGER.info(f"{colorstr('activation:')} {act}") # print
622
+
623
+ if verbose:
624
+ LOGGER.info(f"\n{'':>3}{'from':>20}{'n':>3}{'params':>10} {'module':<45}{'arguments':<30}")
625
+ ch = [ch]
626
+ layers, save, c2 = [], [], ch[-1] # layers, savelist, ch out
627
+ for i, (f, n, m, args) in enumerate(d['backbone'] + d['head']): # from, number, module, args
628
+ m = getattr(torch.nn, m[3:]) if 'nn.' in m else globals()[m] # get module
629
+ for j, a in enumerate(args):
630
+ if isinstance(a, str):
631
+ with contextlib.suppress(ValueError):
632
+ args[j] = locals()[a] if a in locals() else ast.literal_eval(a)
633
+
634
+ n = n_ = max(round(n * depth), 1) if n > 1 else n # depth gain
635
+ if m in (Classify, Conv, ConvTranspose, GhostConv, Bottleneck, GhostBottleneck, SPP, SPPF, DWConv, Focus,
636
+ BottleneckCSP, C1, C2, C2f, C3, C3TR, C3Ghost, nn.ConvTranspose2d, DWConvTranspose2d, C3x, RepC3):
637
+ c1, c2 = ch[f], args[0]
638
+ if c2 != nc: # if c2 not equal to number of classes (i.e. for Classify() output)
639
+ c2 = make_divisible(min(c2, max_channels) * width, 8)
640
+
641
+ args = [c1, c2, *args[1:]]
642
+ if m in (BottleneckCSP, C1, C2, C2f, C3, C3TR, C3Ghost, C3x, RepC3):
643
+ args.insert(2, n) # number of repeats
644
+ n = 1
645
+ elif m is AIFI:
646
+ args = [ch[f], *args]
647
+ elif m in (HGStem, HGBlock):
648
+ c1, cm, c2 = ch[f], args[0], args[1]
649
+ args = [c1, cm, c2, *args[2:]]
650
+ if m is HGBlock:
651
+ args.insert(4, n) # number of repeats
652
+ n = 1
653
+
654
+ elif m is nn.BatchNorm2d:
655
+ args = [ch[f]]
656
+ elif m is Concat:
657
+ c2 = sum(ch[x] for x in f)
658
+ elif m in (Detect, Segment, Pose, RTDETRDecoder):
659
+ args.append([ch[x] for x in f])
660
+ if m is Segment:
661
+ args[2] = make_divisible(min(args[2], max_channels) * width, 8)
662
+ else:
663
+ c2 = ch[f]
664
+
665
+ m_ = nn.Sequential(*(m(*args) for _ in range(n))) if n > 1 else m(*args) # module
666
+ t = str(m)[8:-2].replace('__main__.', '') # module type
667
+ m.np = sum(x.numel() for x in m_.parameters()) # number params
668
+ m_.i, m_.f, m_.type = i, f, t # attach index, 'from' index, type
669
+ if verbose:
670
+ LOGGER.info(f'{i:>3}{str(f):>20}{n_:>3}{m.np:10.0f} {t:<45}{str(args):<30}') # print
671
+ save.extend(x % i for x in ([f] if isinstance(f, int) else f) if x != -1) # append to savelist
672
+ layers.append(m_)
673
+ if i == 0:
674
+ ch = []
675
+ ch.append(c2)
676
+ return nn.Sequential(*layers), sorted(save)
677
+
678
+
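+ # Parsing sketch (illustrative row; the actual rows come from the model YAML): a backbone entry
+ # such as [-1, 3, C2f, [128, True]] means from=-1, repeats=3, module=C2f, args=[128, True].
+ # parse_model scales the repeat count by the depth multiple and the 128 output channels by the
+ # width multiple (capped at max_channels, rounded to a multiple of 8) before building the module.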
679
+ def yaml_model_load(path):
680
+ """Load a YOLOv8 model from a YAML file."""
681
+ import re
682
+
683
+ path = Path(path)
684
+ if path.stem in (f'yolov{d}{x}6' for x in 'nsmlx' for d in (5, 8)):
685
+ new_stem = re.sub(r'(\d+)([nslmx])6(.+)?$', r'\1\2-p6\3', path.stem)
686
+ LOGGER.warning(f'WARNING ⚠️ Ultralytics YOLO P6 models now use -p6 suffix. Renaming {path.stem} to {new_stem}.')
687
+ path = path.with_stem(new_stem)
688
+
689
+ unified_path = re.sub(r'(\d+)([nslmx])(.+)?$', r'\1\3', str(path)) # i.e. yolov8x.yaml -> yolov8.yaml
690
+ yaml_file = check_yaml(unified_path, hard=False) or check_yaml(path)
691
+ d = yaml_load(yaml_file) # model dict
692
+ d['scale'] = guess_model_scale(path)
693
+ d['yaml_file'] = str(path)
694
+ return d
695
+
696
+
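+ # Example sketch (assumes the packaged model YAMLs are discoverable by check_yaml):
+ #   d = yaml_model_load('yolov8s.yaml')   # resolves to the unified yolov8.yaml
+ #   d['scale'], d['yaml_file']            # -> 's', 'yolov8s.yaml'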
697
+ def guess_model_scale(model_path):
698
+ """
699
+ Takes a path to a YOLO model's YAML file as input and extracts the size character of the model's scale.
700
+ The function uses regular expression matching to find the pattern of the model scale in the YAML file name,
701
+ which is denoted by n, s, m, l, or x. The function returns the size character of the model scale as a string.
702
+
703
+ Args:
704
+ model_path (str | Path): The path to the YOLO model's YAML file.
705
+
706
+ Returns:
707
+ (str): The size character of the model's scale, which can be n, s, m, l, or x.
708
+ """
709
+ with contextlib.suppress(AttributeError):
710
+ import re
711
+ return re.search(r'yolov\d+([nslmx])', Path(model_path).stem).group(1) # n, s, m, l, or x
712
+ return ''
713
+
714
+
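+ # Example sketch of the regex behaviour:
+ #   guess_model_scale('yolov8n.yaml')   # -> 'n'
+ #   guess_model_scale('rtdetr-l.yaml')  # -> '' (no 'yolov<d><scale>' pattern, AttributeError suppressed)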
715
+ def guess_model_task(model):
716
+ """
717
+ Guess the task of a PyTorch model from its architecture or configuration.
718
+
719
+ Args:
720
+ model (nn.Module | dict): PyTorch model or model configuration in YAML format.
721
+
722
+ Returns:
723
+ (str): Task of the model ('detect', 'segment', 'classify', 'pose').
724
+
725
+ Raises:
726
+ SyntaxError: If the task of the model could not be determined.
727
+ """
728
+
729
+ def cfg2task(cfg):
730
+ """Guess from YAML dictionary."""
731
+ m = cfg['head'][-1][-2].lower() # output module name
732
+ if m in ('classify', 'classifier', 'cls', 'fc'):
733
+ return 'classify'
734
+ if m == 'detect':
735
+ return 'detect'
736
+ if m == 'segment':
737
+ return 'segment'
738
+ if m == 'pose':
739
+ return 'pose'
740
+
741
+ # Guess from model cfg
742
+ if isinstance(model, dict):
743
+ with contextlib.suppress(Exception):
744
+ return cfg2task(model)
745
+
746
+ # Guess from PyTorch model
747
+ if isinstance(model, nn.Module): # PyTorch model
748
+ for x in 'model.args', 'model.model.args', 'model.model.model.args':
749
+ with contextlib.suppress(Exception):
750
+ return eval(x)['task']
751
+ for x in 'model.yaml', 'model.model.yaml', 'model.model.model.yaml':
752
+ with contextlib.suppress(Exception):
753
+ return cfg2task(eval(x))
754
+
755
+ for m in model.modules():
756
+ if isinstance(m, Detect):
757
+ return 'detect'
758
+ elif isinstance(m, Segment):
759
+ return 'segment'
760
+ elif isinstance(m, Classify):
761
+ return 'classify'
762
+ elif isinstance(m, Pose):
763
+ return 'pose'
764
+
765
+ # Guess from model filename
766
+ if isinstance(model, (str, Path)):
767
+ model = Path(model)
768
+ if '-seg' in model.stem or 'segment' in model.parts:
769
+ return 'segment'
770
+ elif '-cls' in model.stem or 'classify' in model.parts:
771
+ return 'classify'
772
+ elif '-pose' in model.stem or 'pose' in model.parts:
773
+ return 'pose'
774
+ elif 'detect' in model.parts:
775
+ return 'detect'
776
+
777
+ # Unable to determine task from model
778
+ LOGGER.warning("WARNING ⚠️ Unable to automatically guess model task, assuming 'task=detect'. "
779
+ "Explicitly define task for your model, i.e. 'task=detect', 'segment', 'classify', or 'pose'.")
780
+ return 'detect' # assume detect
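+ # Example sketch of the fallback chain (illustrative inputs):
+ #   guess_model_task(Path('yolov8n-seg.pt'))            # -> 'segment' (from the '-seg' filename suffix)
+ #   guess_model_task(DetectionModel('yolov8n.yaml'))    # -> 'detect'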
modules/ultralytics/tracker/README.md ADDED
@@ -0,0 +1,86 @@
1
+ # Tracker
2
+
3
+ ## Supported Trackers
4
+
5
+ - [x] ByteTracker
6
+ - [x] BoT-SORT
7
+
8
+ ## Usage
9
+
10
+ ### Python interface
11
+
12
+ You can use the Python interface to track objects using the YOLO model.
13
+
14
+ ```python
15
+ from ultralytics import YOLO
16
+
17
+ model = YOLO("yolov8n.pt") # or a segmentation model, i.e. yolov8n-seg.pt
18
+ model.track(
19
+ source="video/streams",
20
+ stream=True,
21
+ tracker="botsort.yaml", # or 'bytetrack.yaml'
22
+ show=True,
23
+ )
24
+ ```
25
+
26
+ You can get the IDs of the tracked objects using the following code:
27
+
28
+ ```python
29
+ from ultralytics import YOLO
30
+
31
+ model = YOLO("yolov8n.pt")
32
+
33
+ for result in model.track(source="video.mp4"):
34
+ print(
35
+ result.boxes.id.cpu().numpy().astype(int)
36
+ ) # this will print the IDs of the tracked objects in the frame
37
+ ```
38
+
39
+ If you run the tracker on a folder of images, or loop over video frames yourself, pass the `persist` parameter so the model knows the frames belong to the same sequence and keeps object IDs consistent across them. Without `persist`, each call creates a new tracker, so IDs are reassigned on every frame.
40
+
41
+ ```python
42
+ import cv2
43
+ from ultralytics import YOLO
44
+
45
+ cap = cv2.VideoCapture("video.mp4")
46
+ model = YOLO("yolov8n.pt")
47
+ while True:
48
+ ret, frame = cap.read()
49
+ if not ret:
50
+ break
51
+ results = model.track(frame, persist=True)
52
+ boxes = results[0].boxes.xyxy.cpu().numpy().astype(int)
53
+ ids = results[0].boxes.id.cpu().numpy().astype(int)
54
+ for box, id in zip(boxes, ids):
55
+ cv2.rectangle(frame, (box[0], box[1]), (box[2], box[3]), (0, 255, 0), 2)
56
+ cv2.putText(
57
+ frame,
58
+ f"Id {id}",
59
+ (box[0], box[1]),
60
+ cv2.FONT_HERSHEY_SIMPLEX,
61
+ 1,
62
+ (0, 0, 255),
63
+ 2,
64
+ )
65
+ cv2.imshow("frame", frame)
66
+ if cv2.waitKey(1) & 0xFF == ord("q"):
67
+ break
68
+ ```
69
+
70
+ ## Change tracker parameters
71
+
72
+ You can change the tracker parameters by editing the `tracker.yaml` file located in the `ultralytics/tracker/cfg` folder.
73
+
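+ For example, an edited copy of one of these config files can be passed straight to `model.track()` through the `tracker` argument (the file name below is just a placeholder):
+
+ ```python
+ from ultralytics import YOLO
+
+ model = YOLO("yolov8n.pt")
+ model.track(
+     source="video.mp4",
+     tracker="my_bytetrack.yaml",  # your edited copy of a config from ultralytics/tracker/cfg
+     show=True,
+ )
+ ```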
74
+ ## Command Line Interface (CLI)
75
+
76
+ You can also use the command line interface to track objects using the YOLO model.
77
+
78
+ ```bash
79
+ yolo detect track source=... tracker=...
80
+ yolo segment track source=... tracker=...
81
+ yolo pose track source=... tracker=...
82
+ ```
83
+
84
+ By default, trackers will use the configuration in `ultralytics/tracker/cfg`.
85
+ We also support using a modified tracker config file. Please refer to the tracker config files
86
+ in `ultralytics/tracker/cfg`.