geetu040 committed
Commit b91dbc2 · Parent(s): a138d14

update config since changes in model architecture

Files changed (3)
  1. README.md +4 -4
  2. assets/tiger.jpg +0 -0
  3. config.json +130 -19
README.md CHANGED
@@ -8,7 +8,7 @@ pipeline_tag: depth-estimation
 Install the required libraries:
 ```bash
 pip install -q numpy pillow torch torchvision
-pip install -q git+https://github.com/geetu040/transformers.git@depth-pro-projects#egg=transformers
+pip install -q git+https://github.com/geetu040/transformers.git@depth-pro#egg=transformers
 ```
 
 Import the required libraries:
@@ -22,14 +22,14 @@ from huggingface_hub import hf_hub_download
 import matplotlib.pyplot as plt
 
 # custom installation from this PR: https://github.com/huggingface/transformers/pull/34583
-# !pip install git+https://github.com/geetu040/transformers.git@depth-pro-projects#egg=transformers
+# !pip install git+https://github.com/geetu040/transformers.git@depth-pro#egg=transformers
 from transformers import DepthProConfig, DepthProImageProcessorFast, DepthProForDepthEstimation
 ```
 
 Load the model and image processor:
 ```py
 checkpoint = "geetu040/DepthPro"
-revision = "project"
+revision = "main"
 image_processor = DepthProImageProcessorFast.from_pretrained(checkpoint, revision=revision)
 model = DepthProForDepthEstimation.from_pretrained(checkpoint, revision=revision)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -40,7 +40,7 @@ Inference:
 ```py
 # inference
 
-url = "https://huggingface.co/spaces/geetu040/DepthPro_Segmentation_Human/resolve/main/assets/examples/man_with_arms_open.jpg"
+url = "https://huggingface.co/geetu040/DepthPro/resolve/main/assets/tiger.jpg"
 
 image = Image.open(requests.get(url, stream=True).raw)
 image = image.convert("RGB")
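For context, a minimal sketch of how inference continues from the README snippet above, assuming the DepthPro API from the PR referenced in the README (https://github.com/huggingface/transformers/pull/34583); the post-processing call and its `"predicted_depth"` output key are taken from that PR and should be treated as assumptions:

```py
# sketch: preprocess, forward pass, and post-processing (assumes the PR branch API)
inputs = image_processor(images=image, return_tensors="pt").to(device)

with torch.no_grad():
    outputs = model(**inputs)

# resize the prediction back to the original image size;
# output keys assumed from the DepthPro PR (#34583)
post_processed = image_processor.post_process_depth_estimation(
    outputs, target_sizes=[(image.height, image.width)]
)
depth = post_processed[0]["predicted_depth"]  # metric depth map, shape (H, W)
plt.imshow(depth.detach().cpu().numpy())
plt.axis("off")
plt.show()
```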
assets/tiger.jpg ADDED
config.json CHANGED
@@ -1,14 +1,92 @@
 {
-  "apply_layernorm": true,
   "architectures": [
     "DepthProForDepthEstimation"
   ],
-  "attention_probs_dropout_prob": 0.0,
-  "drop_path_rate": 0.0,
+  "fov_model_config": {
+    "hidden_size": 1024,
+    "image_size": 384,
+    "model_type": "dinov2",
+    "num_attention_heads": 16,
+    "num_hidden_layers": 24,
+    "out_features": [
+      "stage24"
+    ],
+    "out_indices": [
+      24
+    ],
+    "patch_size": 16,
+    "stage_names": [
+      "stem",
+      "stage1",
+      "stage2",
+      "stage3",
+      "stage4",
+      "stage5",
+      "stage6",
+      "stage7",
+      "stage8",
+      "stage9",
+      "stage10",
+      "stage11",
+      "stage12",
+      "stage13",
+      "stage14",
+      "stage15",
+      "stage16",
+      "stage17",
+      "stage18",
+      "stage19",
+      "stage20",
+      "stage21",
+      "stage22",
+      "stage23",
+      "stage24"
+    ],
+    "use_mask_token": false
+  },
   "fusion_hidden_size": 256,
-  "hidden_act": "gelu",
-  "hidden_dropout_prob": 0.0,
-  "hidden_size": 1024,
+  "image_model_config": {
+    "hidden_size": 1024,
+    "image_size": 384,
+    "model_type": "dinov2",
+    "num_attention_heads": 16,
+    "num_hidden_layers": 24,
+    "out_features": [
+      "stage24"
+    ],
+    "out_indices": [
+      24
+    ],
+    "patch_size": 16,
+    "stage_names": [
+      "stem",
+      "stage1",
+      "stage2",
+      "stage3",
+      "stage4",
+      "stage5",
+      "stage6",
+      "stage7",
+      "stage8",
+      "stage9",
+      "stage10",
+      "stage11",
+      "stage12",
+      "stage13",
+      "stage14",
+      "stage15",
+      "stage16",
+      "stage17",
+      "stage18",
+      "stage19",
+      "stage20",
+      "stage21",
+      "stage22",
+      "stage23",
+      "stage24"
+    ],
+    "use_mask_token": false
+  },
   "initializer_range": 0.02,
   "intermediate_feature_dims": [
     256,
@@ -18,18 +96,52 @@
     11,
     5
   ],
-  "layer_norm_eps": 1e-06,
-  "layerscale_value": 1.0,
-  "mlp_ratio": 4,
+  "merge_padding_value": 3,
   "model_type": "depth_pro",
-  "num_attention_heads": 16,
-  "num_channels": 3,
   "num_fov_head_layers": 2,
-  "num_hidden_layers": 24,
-  "patch_embeddings_size": 16,
+  "patch_model_config": {
+    "hidden_size": 1024,
+    "image_size": 384,
+    "model_type": "dinov2",
+    "num_attention_heads": 16,
+    "num_hidden_layers": 24,
+    "out_features": [
+      "stage24"
+    ],
+    "out_indices": [
+      24
+    ],
+    "patch_size": 16,
+    "stage_names": [
+      "stem",
+      "stage1",
+      "stage2",
+      "stage3",
+      "stage4",
+      "stage5",
+      "stage6",
+      "stage7",
+      "stage8",
+      "stage9",
+      "stage10",
+      "stage11",
+      "stage12",
+      "stage13",
+      "stage14",
+      "stage15",
+      "stage16",
+      "stage17",
+      "stage18",
+      "stage19",
+      "stage20",
+      "stage21",
+      "stage22",
+      "stage23",
+      "stage24"
+    ],
+    "use_mask_token": false
+  },
   "patch_size": 384,
-  "qkv_bias": true,
-  "reshape_hidden_states": true,
   "scaled_images_feature_dims": [
     1024,
     1024,
@@ -45,10 +157,9 @@
     0.5,
     1
   ],
-  "torch_dtype": "float32",
-  "transformers_version": "4.48.0.dev0",
+  "torch_dtype": "float16",
+  "transformers_version": "4.49.0.dev0",
   "use_batch_norm_in_fusion_residual": false,
   "use_bias_in_fusion_residual": true,
-  "use_fov_model": true,
-  "use_swiglu_ffn": false
+  "use_fov_model": true
 }
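The substance of this commit: the flat ViT fields (`hidden_size`, `num_hidden_layers`, `num_attention_heads`, etc.) move out of the top-level config and into three nested Dinov2 backbone configs, one per encoder (`patch_model_config`, `image_model_config`, `fov_model_config`). A minimal sketch of inspecting the updated config, assuming the custom branch installed above deserializes the nested dicts into sub-config objects:

```py
from transformers import DepthProConfig

# load the updated config from the Hub (revision "main" per the README change)
config = DepthProConfig.from_pretrained("geetu040/DepthPro", revision="main")

# each encoder now carries its own Dinov2 backbone config instead of
# sharing flat hidden_size / num_hidden_layers fields at the top level
for name in ("patch_model_config", "image_model_config", "fov_model_config"):
    sub = getattr(config, name)
    print(name, sub.model_type, sub.hidden_size, sub.num_hidden_layers)
```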