update config since changes in model architecture

Files changed:
- README.md (+4 -4)
- assets/tiger.jpg (added)
- config.json (+130 -19)
README.md CHANGED

@@ -8,7 +8,7 @@ pipeline_tag: depth-estimation
 Install the required libraries:
 ```bash
 pip install -q numpy pillow torch torchvision
-pip install -q git+https://github.com/geetu040/transformers.git@depth-pro
+pip install -q git+https://github.com/geetu040/transformers.git@depth-pro#egg=transformers
 ```
 
 Import the required libraries:
@@ -22,14 +22,14 @@ from huggingface_hub import hf_hub_download
 import matplotlib.pyplot as plt
 
 # custom installation from this PR: https://github.com/huggingface/transformers/pull/34583
-# !pip install git+https://github.com/geetu040/transformers.git@depth-pro
+# !pip install git+https://github.com/geetu040/transformers.git@depth-pro#egg=transformers
 from transformers import DepthProConfig, DepthProImageProcessorFast, DepthProForDepthEstimation
 ```
 
 Load the model and image processor:
 ```py
 checkpoint = "geetu040/DepthPro"
-revision = "
+revision = "main"
 image_processor = DepthProImageProcessorFast.from_pretrained(checkpoint, revision=revision)
 model = DepthProForDepthEstimation.from_pretrained(checkpoint, revision=revision)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -40,7 +40,7 @@ Inference:
 ```py
 # inference
 
-url = "https://huggingface.co/
+url = "https://huggingface.co/geetu040/DepthPro/resolve/main/assets/tiger.jpg"
 
 image = Image.open(requests.get(url, stream=True).raw)
 image = image.convert("RGB")
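The README's inference example continues past the hunk shown above. Below is a minimal sketch of the remaining steps, reusing the objects created in the README snippets; the `post_process_depth_estimation` helper and its `"predicted_depth"` output key are assumptions of this sketch, not confirmed by this diff:

```py
# Sketch of the remaining inference steps, assuming the API from PR #34583.
# post_process_depth_estimation and "predicted_depth" are assumed names.
inputs = image_processor(images=image, return_tensors="pt").to(device)
with torch.no_grad():
    outputs = model(**inputs)

# Convert the raw prediction back to a depth map at the original resolution.
post_processed = image_processor.post_process_depth_estimation(
    outputs, target_sizes=[(image.height, image.width)]
)
depth = post_processed[0]["predicted_depth"]  # torch.Tensor
```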
assets/tiger.jpg ADDED
config.json CHANGED

@@ -1,14 +1,92 @@
 {
-  "apply_layernorm": true,
   "architectures": [
     "DepthProForDepthEstimation"
   ],
-  "
-
+  "fov_model_config": {
+    "hidden_size": 1024,
+    "image_size": 384,
+    "model_type": "dinov2",
+    "num_attention_heads": 16,
+    "num_hidden_layers": 24,
+    "out_features": [
+      "stage24"
+    ],
+    "out_indices": [
+      24
+    ],
+    "patch_size": 16,
+    "stage_names": [
+      "stem",
+      "stage1",
+      "stage2",
+      "stage3",
+      "stage4",
+      "stage5",
+      "stage6",
+      "stage7",
+      "stage8",
+      "stage9",
+      "stage10",
+      "stage11",
+      "stage12",
+      "stage13",
+      "stage14",
+      "stage15",
+      "stage16",
+      "stage17",
+      "stage18",
+      "stage19",
+      "stage20",
+      "stage21",
+      "stage22",
+      "stage23",
+      "stage24"
+    ],
+    "use_mask_token": false
+  },
   "fusion_hidden_size": 256,
-  "
-
-
+  "image_model_config": {
+    "hidden_size": 1024,
+    "image_size": 384,
+    "model_type": "dinov2",
+    "num_attention_heads": 16,
+    "num_hidden_layers": 24,
+    "out_features": [
+      "stage24"
+    ],
+    "out_indices": [
+      24
+    ],
+    "patch_size": 16,
+    "stage_names": [
+      "stem",
+      "stage1",
+      "stage2",
+      "stage3",
+      "stage4",
+      "stage5",
+      "stage6",
+      "stage7",
+      "stage8",
+      "stage9",
+      "stage10",
+      "stage11",
+      "stage12",
+      "stage13",
+      "stage14",
+      "stage15",
+      "stage16",
+      "stage17",
+      "stage18",
+      "stage19",
+      "stage20",
+      "stage21",
+      "stage22",
+      "stage23",
+      "stage24"
+    ],
+    "use_mask_token": false
+  },
   "initializer_range": 0.02,
   "intermediate_feature_dims": [
     256,
@@ -18,18 +96,52 @@
     11,
     5
   ],
-  "
-  "layerscale_value": 1.0,
-  "mlp_ratio": 4,
+  "merge_padding_value": 3,
   "model_type": "depth_pro",
-  "num_attention_heads": 16,
-  "num_channels": 3,
   "num_fov_head_layers": 2,
-  "
-
+  "patch_model_config": {
+    "hidden_size": 1024,
+    "image_size": 384,
+    "model_type": "dinov2",
+    "num_attention_heads": 16,
+    "num_hidden_layers": 24,
+    "out_features": [
+      "stage24"
+    ],
+    "out_indices": [
+      24
+    ],
+    "patch_size": 16,
+    "stage_names": [
+      "stem",
+      "stage1",
+      "stage2",
+      "stage3",
+      "stage4",
+      "stage5",
+      "stage6",
+      "stage7",
+      "stage8",
+      "stage9",
+      "stage10",
+      "stage11",
+      "stage12",
+      "stage13",
+      "stage14",
+      "stage15",
+      "stage16",
+      "stage17",
+      "stage18",
+      "stage19",
+      "stage20",
+      "stage21",
+      "stage22",
+      "stage23",
+      "stage24"
+    ],
+    "use_mask_token": false
+  },
   "patch_size": 384,
-  "qkv_bias": true,
-  "reshape_hidden_states": true,
   "scaled_images_feature_dims": [
     1024,
     1024,
@@ -45,10 +157,9 @@
     0.5,
     1
   ],
-  "torch_dtype": "
-  "transformers_version": "4.
+  "torch_dtype": "float16",
+  "transformers_version": "4.49.0.dev0",
   "use_batch_norm_in_fusion_residual": false,
   "use_bias_in_fusion_residual": true,
-  "use_fov_model": true,
-  "use_swiglu_ffn": false
+  "use_fov_model": true
 }
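Taken together, this change removes the flat ViT-style keys (`apply_layernorm`, `layerscale_value`, `mlp_ratio`, `num_attention_heads`, `num_channels`, `qkv_bias`, `reshape_hidden_states`, `use_swiglu_ffn`) and instead nests a full `dinov2` backbone config for each of the three encoders: patch, image, and field-of-view. A minimal sketch of loading and inspecting the restructured config follows; that `DepthProConfig` exposes the nested entries as attributes is an assumption of this sketch:

```py
from transformers import DepthProConfig

# Load the restructured config from the Hub (revision "main" per the README).
config = DepthProConfig.from_pretrained("geetu040/DepthPro", revision="main")

# Each encoder now carries its own nested dinov2 backbone config; attribute
# names mirroring the JSON keys above are assumed here.
for name in ("patch_model_config", "image_model_config", "fov_model_config"):
    sub = getattr(config, name)
    model_type = sub["model_type"] if isinstance(sub, dict) else sub.model_type
    print(f"{name}: model_type={model_type}")
```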