chenz53 committed
Commit 99aa071
Parent: 09beebc

Upload 3 files

Files changed (3)
  1. config.json +30 -0
  2. model.safetensors +3 -0
  3. model_architecture.txt +68 -0
config.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "architectures": [
+     "VideoMAEForPreTraining"
+   ],
+   "attention_probs_dropout_prob": 0.0,
+   "attn_implementation": "flash_attention_2",
+   "decoder_hidden_size": 384,
+   "decoder_intermediate_size": 1536,
+   "decoder_num_attention_heads": 6,
+   "decoder_num_hidden_layers": 4,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.0,
+   "hidden_size": 768,
+   "image_size": 384,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-12,
+   "model_type": "videomae",
+   "norm_pix_loss": true,
+   "num_attention_heads": 12,
+   "num_channels": 1,
+   "num_frames": 320,
+   "num_hidden_layers": 12,
+   "patch_size": 16,
+   "qkv_bias": true,
+   "torch_dtype": "float32",
+   "transformers_version": "4.46.1",
+   "tubelet_size": 16,
+   "use_mean_pooling": true
+ }
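
The config above defines a VideoMAE encoder-decoder for masked pretraining on single-channel 384x384 clips of 320 frames split into 16-frame tubelets. A minimal sketch of loading it with transformers, assuming the three uploaded files live in a local directory (the path "./videomae-pretrain" is hypothetical):

from transformers import VideoMAEForPreTraining

# Hypothetical local checkout of this repo (config.json + model.safetensors).
# attn_implementation="sdpa" avoids requiring flash-attn at load time and matches
# the VideoMAESdpaAttention modules shown in model_architecture.txt below.
model = VideoMAEForPreTraining.from_pretrained(
    "./videomae-pretrain", attn_implementation="sdpa"
)

config = model.config
# Key fields from config.json: 1 input channel, 384x384 frames,
# 320 frames per clip, 16-frame tubelets.
print(config.num_channels, config.image_size, config.num_frames, config.tubelet_size)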
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:84744ee6437115d109c7d239f907f7503c17cc1a532c3e3b5c7bf08b72c69493
+ size 388674152
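
model.safetensors is stored as a Git LFS pointer; only the sha256 and size are committed here, and the 388,674,152-byte payload corresponds to roughly 97M float32 parameters (4 bytes per weight plus a small header). A quick sanity check, assuming the payload has been fetched with git lfs pull:

from safetensors.torch import load_file

# Load the tensors from the checkpoint (requires the real file, not the
# LFS pointer) and count parameters.
state_dict = load_file("model.safetensors")
num_params = sum(t.numel() for t in state_dict.values())
print(f"{num_params:,} parameters")  # expected around 97M for a 388,674,152-byte float32 file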
model_architecture.txt ADDED
@@ -0,0 +1,68 @@
+ VideoMAEForPreTraining(
+   (videomae): VideoMAEModel(
+     (embeddings): VideoMAEEmbeddings(
+       (patch_embeddings): VideoMAEPatchEmbeddings(
+         (projection): Conv3d(1, 768, kernel_size=(16, 16, 16), stride=(16, 16, 16))
+       )
+     )
+     (encoder): VideoMAEEncoder(
+       (layer): ModuleList(
+         (0-11): 12 x VideoMAELayer(
+           (attention): VideoMAESdpaAttention(
+             (attention): VideoMAESdpaSelfAttention(
+               (query): Linear(in_features=768, out_features=768, bias=False)
+               (key): Linear(in_features=768, out_features=768, bias=False)
+               (value): Linear(in_features=768, out_features=768, bias=False)
+               (dropout): Dropout(p=0.0, inplace=False)
+             )
+             (output): VideoMAESelfOutput(
+               (dense): Linear(in_features=768, out_features=768, bias=True)
+               (dropout): Dropout(p=0.0, inplace=False)
+             )
+           )
+           (intermediate): VideoMAEIntermediate(
+             (dense): Linear(in_features=768, out_features=3072, bias=True)
+             (intermediate_act_fn): GELUActivation()
+           )
+           (output): VideoMAEOutput(
+             (dense): Linear(in_features=3072, out_features=768, bias=True)
+             (dropout): Dropout(p=0.0, inplace=False)
+           )
+           (layernorm_before): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
+           (layernorm_after): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
+         )
+       )
+     )
+   )
+   (encoder_to_decoder): Linear(in_features=768, out_features=384, bias=False)
+   (decoder): VideoMAEDecoder(
+     (decoder_layers): ModuleList(
+       (0-3): 4 x VideoMAELayer(
+         (attention): VideoMAESdpaAttention(
+           (attention): VideoMAESdpaSelfAttention(
+             (query): Linear(in_features=384, out_features=384, bias=False)
+             (key): Linear(in_features=384, out_features=384, bias=False)
+             (value): Linear(in_features=384, out_features=384, bias=False)
+             (dropout): Dropout(p=0.0, inplace=False)
+           )
+           (output): VideoMAESelfOutput(
+             (dense): Linear(in_features=384, out_features=384, bias=True)
+             (dropout): Dropout(p=0.0, inplace=False)
+           )
+         )
+         (intermediate): VideoMAEIntermediate(
+           (dense): Linear(in_features=384, out_features=1536, bias=True)
+           (intermediate_act_fn): GELUActivation()
+         )
+         (output): VideoMAEOutput(
+           (dense): Linear(in_features=1536, out_features=384, bias=True)
+           (dropout): Dropout(p=0.0, inplace=False)
+         )
+         (layernorm_before): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
+         (layernorm_after): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
+       )
+     )
+     (norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
+     (head): Linear(in_features=384, out_features=4096, bias=True)
+   )
+ )
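
model_architecture.txt is the default print(model) dump of the pretraining model. A sketch that reproduces it and checks that the decoder head width and the encoder sequence length follow from the patch geometry in config.json (local path hypothetical, as above):

from transformers import VideoMAEForPreTraining

model = VideoMAEForPreTraining.from_pretrained(
    "./videomae-pretrain", attn_implementation="sdpa"  # hypothetical local path
)
print(model)  # should print the same module tree as model_architecture.txt

config = model.config
# The 4096-wide decoder head reconstructs one tubelet of raw pixels:
# tubelet_size * patch_size**2 * num_channels = 16 * 16 * 16 * 1 = 4096
assert config.tubelet_size * config.patch_size ** 2 * config.num_channels == 4096

# Tubelets seen by the encoder before masking:
# (num_frames / tubelet_size) * (image_size / patch_size)**2 = 20 * 24**2 = 11520
num_patches = (config.num_frames // config.tubelet_size) * (config.image_size // config.patch_size) ** 2
print(num_patches)  # 11520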