Night-Quiet committed
Commit 5e0c4e5 · 1 Parent(s): 42ed155
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.json filter=lfs diff=lfs merge=lfs -text
+ *.jsonl filter=lfs diff=lfs merge=lfs -text
gaudio/audio_encoder.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:75759b2e87f6e8974f86d1d275c6a9d1c96f2a300b32e2c88ee5180331ff558a
+ size 884939345
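
Note: most files in this commit are checked in as Git LFS pointers rather than the blobs themselves: three "key value" lines giving the spec version, the sha256 of the real content, and its byte size. A minimal Python sketch of reading those fields back (parse_pointer is an illustrative helper, not code from this repo):

def parse_pointer(path):
    # An LFS pointer is a short text file of "key value" lines.
    fields = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

ptr = parse_pointer("gaudio/audio_encoder.pt")
assert ptr["version"] == "https://git-lfs.github.com/spec/v1"
print(ptr["oid"], ptr["size"])  # sha256:75759b2e... 884939345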
gaudio/config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:de7152eec4e7d795e9c2855fa971dcb22af49fdab01bd2177c41bb6ae5c64fdf
+ size 2585
gaudio/preprocessor_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ba6754d6c8d51e343d5cd431549cc0aad8f41a4464a23d3dcf3d299859bfe084
+ size 604
gaudio/special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5d5b662e421ea9fac075174bb0688ee0d9431699900b90662acd44b2a350503a
+ size 695
gaudio/text_encoder.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ea9e501c7b83c1fc4f2d2d9ac0c089c4a64029211183a4b3d64394af35f1e7a7
+ size 435643029
gaudio/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d241a60d5e8f04cc1b2b3e9ef7a4921b27bf526d9f6050ab90f9267a1f9e5c66
+ size 711396
gaudio/tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ba7f777cdd4b947ffa1428b898637bfd00ec9c8fc09fc416645881e5b3dc323c
+ size 1289
gaudio/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
gdino/config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:89ca35a59ea32610b61561b3dc07c785ff539634223097306b20bd3581a7ce4a
+ size 1789
gdino/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4f29e24728239929bfc149fb5fe238b3707fddd6048d87d8eaabb38181c79a8b
+ size 933400776
gdino_processor/preprocessor_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c17fd68afb1f124bfb87a494d409925eec4201b3487c8a694dde64d9ce7109a3
+ size 511
gdino_processor/special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5d5b662e421ea9fac075174bb0688ee0d9431699900b90662acd44b2a350503a
+ size 695
gdino_processor/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d241a60d5e8f04cc1b2b3e9ef7a4921b27bf526d9f6050ab90f9267a1f9e5c66
+ size 711396
gdino_processor/tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d40ab645b68211910b9170d22433d43186a6ec8ee6fd10ba170524b25bf4fb56
+ size 1237
gdino_processor/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
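
Note: gdino/ and gdino_processor/ follow the directory layout that transformers' Grounding DINO classes read, so loading could plausibly look like the sketch below once the LFS blobs are pulled. The class names are the stock transformers API; whether this repo loads the weights this way is an assumption.

from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

# Local directories from this commit; LFS blobs must be pulled first.
processor = AutoProcessor.from_pretrained("gdino_processor")
model = AutoModelForZeroShotObjectDetection.from_pretrained("gdino")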
 
gsam2_image/gsam2_image.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2b3b1bc43ac04d206b52dc70e9032dc8ed55982d47c8ff5619234c9f349e6d6b
+ size 898072910
gsam2_image/gsam2_image.yaml ADDED
@@ -0,0 +1,123 @@
+ # @package _global_
+
+ # Model
+ model:
+   _target_: modules.gsam2.sam2.sam2_base.SAM2Base
+   image_encoder:
+     _target_: modules.gsam2.sam2.image_encoder.ImageEncoder
+     scalp: 1
+     trunk:
+       _target_: modules.gsam2.sam2.hieradet.Hiera
+       embed_dim: 144
+       num_heads: 2
+       stages: [2, 6, 36, 4]
+       global_att_blocks: [23, 33, 43]
+       window_pos_embed_bkg_spatial_size: [7, 7]
+       window_spec: [8, 4, 16, 8]
+     neck:
+       _target_: modules.gsam2.sam2.image_encoder.FpnNeck
+       position_encoding:
+         _target_: modules.gsam2.sam2.position_encoding.PositionEmbeddingSine
+         num_pos_feats: 256
+         normalize: true
+         scale: null
+         temperature: 10000
+       d_model: 256
+       backbone_channel_list: [1152, 576, 288, 144]
+       fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
+       fpn_interp_model: nearest
+
+   memory_attention:
+     _target_: modules.gsam2.sam2.memory_attention.MemoryAttention
+     d_model: 256
+     pos_enc_at_input: true
+     layer:
+       _target_: modules.gsam2.sam2.memory_attention.MemoryAttentionLayer
+       activation: relu
+       dim_feedforward: 2048
+       dropout: 0.1
+       pos_enc_at_attn: false
+       self_attention:
+         _target_: modules.gsam2.sam2.transformer.RoPEAttention
+         rope_theta: 10000.0
+         feat_sizes: [32, 32]
+         embedding_dim: 256
+         num_heads: 1
+         downsample_rate: 1
+         dropout: 0.1
+       d_model: 256
+       pos_enc_at_cross_attn_keys: true
+       pos_enc_at_cross_attn_queries: false
+       cross_attention:
+         _target_: modules.gsam2.sam2.transformer.RoPEAttention
+         rope_theta: 10000.0
+         feat_sizes: [32, 32]
+         rope_k_repeat: True
+         embedding_dim: 256
+         num_heads: 1
+         downsample_rate: 1
+         dropout: 0.1
+         kv_in_dim: 64
+     num_layers: 4
+
+   memory_encoder:
+     _target_: modules.gsam2.sam2.memory_encoder.MemoryEncoder
+     out_dim: 64
+     position_encoding:
+       _target_: modules.gsam2.sam2.position_encoding.PositionEmbeddingSine
+       num_pos_feats: 64
+       normalize: true
+       scale: null
+       temperature: 10000
+     mask_downsampler:
+       _target_: modules.gsam2.sam2.memory_encoder.MaskDownSampler
+       kernel_size: 3
+       stride: 2
+       padding: 1
+     fuser:
+       _target_: modules.gsam2.sam2.memory_encoder.Fuser
+       layer:
+         _target_: modules.gsam2.sam2.memory_encoder.CXBlock
+         dim: 256
+         kernel_size: 7
+         padding: 3
+         layer_scale_init_value: 1e-6
+         use_dwconv: True # depth-wise convs
+       num_layers: 2
+   sam_mask_decoder_extra_args:
+     dynamic_multimask_via_stability: true
+     dynamic_multimask_stability_delta: 0.05
+     dynamic_multimask_stability_thresh: 0.98
+   num_maskmem: 7
+   image_size: 1024
+   # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+   sigmoid_scale_for_mem_enc: 20.0
+   sigmoid_bias_for_mem_enc: -10.0
+   use_mask_input_as_output_without_sam: true
+   # Memory
+   directly_add_no_mem_embed: true
+   no_obj_embed_spatial: true
+   # use high-resolution feature map in the SAM mask decoder
+   use_high_res_features_in_sam: true
+   # output 3 masks on the first click on initial conditioning frames
+   multimask_output_in_sam: true
+   # SAM heads
+   iou_prediction_use_sigmoid: True
+   # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+   use_obj_ptrs_in_encoder: true
+   add_tpos_enc_to_obj_ptrs: true
+   proj_tpos_enc_in_obj_ptrs: true
+   use_signed_tpos_enc_to_obj_ptrs: true
+   only_obj_ptrs_in_the_past_for_eval: true
+   # object occlusion prediction
+   pred_obj_scores: true
+   pred_obj_scores_mlp: true
+   fixed_no_obj_ptr: true
+   # multimask tracking settings
+   multimask_output_for_tracking: true
+   use_multimask_token_for_obj_ptr: true
+   multimask_min_pt_num: 0
+   multimask_max_pt_num: 1
+   use_mlp_for_obj_ptr_proj: true
+   # Compilation flag
+   compile_image_encoder: False
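
Note: the _target_ keys mark this as a Hydra-style instantiation config: each node names a class plus its constructor arguments, nested recursively (SAM2Base wrapping a Hiera trunk, FpnNeck, memory attention and encoder, and so on). A minimal sketch of building the model from the file; hydra.utils.instantiate is the standard API for such configs, though whether this repo invokes it exactly this way is an assumption:

from omegaconf import OmegaConf
from hydra.utils import instantiate

# Recursively constructs SAM2Base and every nested _target_ object.
cfg = OmegaConf.load("gsam2_image/gsam2_image.yaml")
sam2 = instantiate(cfg.model)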
gsam2_video/gsam2_video.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2b3b1bc43ac04d206b52dc70e9032dc8ed55982d47c8ff5619234c9f349e6d6b
+ size 898072910
gsam2_video/gsam2_video.yaml ADDED
@@ -0,0 +1,125 @@
+ model:
+   _target_: modules.gsam2.sam2_video_predictor.SAM2VideoPredictor
+   image_encoder:
+     _target_: modules.gsam2.sam2.image_encoder.ImageEncoder
+     scalp: 1
+     trunk:
+       _target_: modules.gsam2.sam2.hieradet.Hiera
+       embed_dim: 144
+       num_heads: 2
+       stages: [2, 6, 36, 4]
+       global_att_blocks: [23, 33, 43]
+       window_pos_embed_bkg_spatial_size: [7, 7]
+       window_spec: [8, 4, 16, 8]
+     neck:
+       _target_: modules.gsam2.sam2.image_encoder.FpnNeck
+       position_encoding:
+         _target_: modules.gsam2.sam2.position_encoding.PositionEmbeddingSine
+         num_pos_feats: 256
+         normalize: true
+         scale: null
+         temperature: 10000
+       d_model: 256
+       backbone_channel_list: [1152, 576, 288, 144]
+       fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
+       fpn_interp_model: nearest
+
+   memory_attention:
+     _target_: modules.gsam2.sam2.memory_attention.MemoryAttention
+     d_model: 256
+     pos_enc_at_input: true
+     layer:
+       _target_: modules.gsam2.sam2.memory_attention.MemoryAttentionLayer
+       activation: relu
+       dim_feedforward: 2048
+       dropout: 0.1
+       pos_enc_at_attn: false
+       self_attention:
+         _target_: modules.gsam2.sam2.transformer.RoPEAttention
+         rope_theta: 10000.0
+         feat_sizes: [32, 32]
+         embedding_dim: 256
+         num_heads: 1
+         downsample_rate: 1
+         dropout: 0.1
+       d_model: 256
+       pos_enc_at_cross_attn_keys: true
+       pos_enc_at_cross_attn_queries: false
+       cross_attention:
+         _target_: modules.gsam2.sam2.transformer.RoPEAttention
+         rope_theta: 10000.0
+         feat_sizes: [32, 32]
+         rope_k_repeat: True
+         embedding_dim: 256
+         num_heads: 1
+         downsample_rate: 1
+         dropout: 0.1
+         kv_in_dim: 64
+     num_layers: 4
+
+   memory_encoder:
+     _target_: modules.gsam2.sam2.memory_encoder.MemoryEncoder
+     out_dim: 64
+     position_encoding:
+       _target_: modules.gsam2.sam2.position_encoding.PositionEmbeddingSine
+       num_pos_feats: 64
+       normalize: true
+       scale: null
+       temperature: 10000
+     mask_downsampler:
+       _target_: modules.gsam2.sam2.memory_encoder.MaskDownSampler
+       kernel_size: 3
+       stride: 2
+       padding: 1
+     fuser:
+       _target_: modules.gsam2.sam2.memory_encoder.Fuser
+       layer:
+         _target_: modules.gsam2.sam2.memory_encoder.CXBlock
+         dim: 256
+         kernel_size: 7
+         padding: 3
+         layer_scale_init_value: 1e-6
+         use_dwconv: True # depth-wise convs
+       num_layers: 2
+
+   sam_mask_decoder_extra_args:
+     dynamic_multimask_via_stability: true
+     dynamic_multimask_stability_delta: 0.05
+     dynamic_multimask_stability_thresh: 0.98
+   # binarize the sigmoid mask logits on interacted frames with clicks in the memory encoder so that the encoded masks are exactly as what users see from clicking
+   binarize_mask_from_pts_for_mem_enc: true
+   # fill small holes in the low-res masks up to `fill_hole_area` (before resizing them to the original video resolution)
+   fill_hole_area: 8
+   num_maskmem: 7
+   image_size: 1024
+   # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+   sigmoid_scale_for_mem_enc: 20.0
+   sigmoid_bias_for_mem_enc: -10.0
+   use_mask_input_as_output_without_sam: true
+   # Memory
+   directly_add_no_mem_embed: true
+   no_obj_embed_spatial: true
+   # use high-resolution feature map in the SAM mask decoder
+   use_high_res_features_in_sam: true
+   # output 3 masks on the first click on initial conditioning frames
+   multimask_output_in_sam: true
+   # SAM heads
+   iou_prediction_use_sigmoid: True
+   # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+   use_obj_ptrs_in_encoder: true
+   add_tpos_enc_to_obj_ptrs: true
+   proj_tpos_enc_in_obj_ptrs: true
+   use_signed_tpos_enc_to_obj_ptrs: true
+   only_obj_ptrs_in_the_past_for_eval: true
+   # object occlusion prediction
+   pred_obj_scores: true
+   pred_obj_scores_mlp: true
+   fixed_no_obj_ptr: true
+   # multimask tracking settings
+   multimask_output_for_tracking: true
+   use_multimask_token_for_obj_ptr: true
+   multimask_min_pt_num: 0
+   multimask_max_pt_num: 1
+   use_mlp_for_obj_ptr_proj: true
+   # Compilation flag
+   compile_image_encoder: False
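
Note: gsam2_video.pt has the same oid and size as gsam2_image.pt, so both pointers resolve to a single 898072910-byte blob. Loading it into the instantiated predictor might look like this sketch, under the stated assumption that the .pt file stores a plain state dict:

import torch
from omegaconf import OmegaConf
from hydra.utils import instantiate

cfg = OmegaConf.load("gsam2_video/gsam2_video.yaml")
predictor = instantiate(cfg.model)
# Assumption: the checkpoint is a bare state dict, not a wrapper object.
state = torch.load("gsam2_video/gsam2_video.pt", map_location="cpu")
predictor.load_state_dict(state)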
llama/config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0f7941a3d70fc216b82de7df97863937dcd33e644416665b89dbdc67b1daa66a
+ size 928
llama/generation_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8d4beaeab19fc8c2d4b0725864a97f73bf787b1bd4bec222cc9e1e6a8030ca9c
+ size 230
llama/model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:477ff93289f45ff953aa9f7ce6331d5fce4ee466f53865301333ea0ccece72dd
+ size 4976698592
llama/model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8899cf9adf52f3fa093a6469f4efaecf66d9b62317eddde424ae8358d704a05b
+ size 4999802616
llama/model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fe29f6c7b95a76fafb3f9dcd614f036f41f44c19ff2e4cf58a9e2ccf442e10b1
+ size 4915916080
llama/model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:013bc9e3d5d06232d8a358ad3cd323b732255d31e2debe3bf0c6325b88110ae0
+ size 1168138808
llama/model.safetensors.index.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:146776fce3f6db1103aa6f249e65ee5544c5923ce6f971b092eee79aa6e5d37b
+ size 23950
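
Note: the four safetensors shards plus model.safetensors.index.json are the standard Hugging Face sharded-checkpoint layout; the shard sizes sum to roughly 16.06 GB, consistent with a model of about 8B parameters in 16-bit precision. With the LFS blobs pulled, transformers resolves the shards through the index automatically; a sketch:

from transformers import AutoModelForCausalLM

# The index maps each tensor name to its shard, so all four
# model-0000x-of-00004.safetensors files are loaded transparently.
model = AutoModelForCausalLM.from_pretrained("llama", torch_dtype="auto")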
sensevoice/am.mvn ADDED
@@ -0,0 +1,8 @@
+ <Nnet>
+ <Splice> 400 400
+ [ 0 ]
+ <AddShift> 400 400
+ <LearnRateCoef> 0 [ -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 
-13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 ]
+ <Rescale> 400 400
+ <LearnRateCoef> 0 [ 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 
0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 ]
+ </Nnet>
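
Note: am.mvn is a Kaldi-style nnet snippet that applies global CMVN to 400-dim features, matching the VAD frontend below (n_mels: 80 × lfr_m: 5 = 400): <AddShift> adds the first 400-value vector (negated means) and <Rescale> multiplies by the second (roughly inverse standard deviations). The equivalent arithmetic, with shift and scale standing in for the two <LearnRateCoef> vectors above:

import numpy as np

def apply_cmvn(feats, shift, scale):
    # feats: (num_frames, 400) LFR features; shift and scale: (400,)
    # vectors from the <AddShift> and <Rescale> blocks of am.mvn.
    return (feats + shift) * scale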
sensevoice/chn_jpn_yue_eng_ko_spectok.bpe.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aa87f86064c3730d799ddf7af3c04659151102cba548bce325cf06ba4da4e6a8
+ size 377341
sensevoice/sensevoice.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f6280b847a0ec99377af351999a2d2223d005dd3161d4005328db35820d6f91d
+ size 938091356
sensevoice/sensevoice.yaml ADDED
@@ -0,0 +1,200 @@
+ model:
+   _target_: modules.sensevoice.model.SenseVoice
+   model:
+     _target_: modules.sensevoice.sensevoicesmall.SenseVoiceSmall
+     specaug:
+       _target_: modules.sensevoice.specaug.SpecAugLFR
+       apply_freq_mask: true
+       apply_time_mask: true
+       apply_time_warp: false
+       freq_mask_width_range: [0, 30]
+       lfr_rate: 6
+       num_freq_mask: 1
+       num_time_mask: 1
+       time_mask_width_range: [0, 12]
+       time_warp_mode: bicubic
+       time_warp_window: 5
+     encoder:
+       _target_: modules.sensevoice.sensevoicesmall.SenseVoiceEncoderSmall
+       attention_dropout_rate: 0.1
+       attention_heads: 4
+       dropout_rate: 0.1
+       kernel_size: 11
+       linear_units: 2048
+       normalize_before: true
+       num_blocks: 50
+       output_size: 512
+       sanm_shfit: 0
+       tp_blocks: 20
+       input_size: 560
+     tokenizer:
+       _target_: modules.sensevoice.tokenizer.SentencepiecesTokenizer
+       bpemodel: /storage-root/datasets/zhaojingtong/checkpoints/sensevoice/chn_jpn_yue_eng_ko_spectok.bpe.model
+       unk_symbol: <unk>
+       split_with_space: true
+     frontend:
+       _target_: modules.sensevoice.frontend.WavFrontend
+       fs: 16000
+       window: hamming
+       n_mels: 80
+       frame_length: 25
+       frame_shift: 10
+       lfr_m: 7
+       lfr_n: 6
+     length_normalized_loss: true
+     input_size: 560
+     vocab_size: 25055
+     sos: 1
+     eos: 2
+     ignore_id: -1
+   vad_model:
+     _target_: modules.sensevoice.vad.FsmnVADStreaming
+     encoder:
+       _target_: modules.sensevoice.vad.FSMN
+       input_dim: 400
+       input_affine_dim: 140
+       fsmn_layers: 4
+       linear_dim: 250
+       proj_dim: 128
+       lorder: 20
+       rorder: 0
+       lstride: 1
+       rstride: 0
+       output_affine_dim: 140
+       output_dim: 248
+     frontend:
+       _target_: modules.sensevoice.frontend.WavFrontendOnline
+       fs: 16000
+       window: hamming
+       n_mels: 80
+       frame_length: 25
+       frame_shift: 10
+       dither: 0.0
+       lfr_m: 5
+       lfr_n: 1
+       cmvn_file: /storage-root/datasets/zhaojingtong/checkpoints/sensevoice/am.mvn
+     sample_rate: 16000
+     detect_mode: 1
+     snr_mode: 0
+     max_end_silence_time: 800
+     max_start_silence_time: 3000
+     do_start_point_detection: True
+     do_end_point_detection: True
+     window_size_ms: 200
+     sil_to_speech_time_thres: 150
+     speech_to_sil_time_thres: 150
+     speech_2_noise_ratio: 1.0
+     do_extend: 1
+     lookback_time_start_point: 200
+     lookahead_time_end_point: 100
+     max_single_segment_time: 60000
+     snr_thres: -100.0
+     noise_frame_num_used_for_snr: 100
+     decibel_thres: -100.0
+     speech_noise_thres: 0.6
+     fe_prior_thres: 0.0001
+     silence_pdf_num: 1
+     sil_pdf_ids: [0]
+     speech_noise_thresh_low: -0.1
+     speech_noise_thresh_high: 0.3
+     output_frame_probs: False
+     frame_in_ms: 10
+     frame_length_ms: 25
+     tokenizer: None
+     vocab_size: -1
+     input_size: 400
+   kwargs:
+     specaug:
+       apply_freq_mask: true
+       apply_time_mask: true
+       apply_time_warp: false
+       freq_mask_width_range: [0, 30]
+       lfr_rate: 6
+       num_freq_mask: 1
+       num_time_mask: 1
+       time_mask_width_range: [0, 12]
+       time_warp_mode: bicubic
+       time_warp_window: 5
+     encoder:
+       attention_dropout_rate: 0.1
+       attention_heads: 4
+       dropout_rate: 0.1
+       kernel_size: 11
+       linear_units: 2048
+       normalize_before: true
+       num_blocks: 50
+       output_size: 512
+       sanm_shfit: 0
+       tp_blocks: 20
+       input_size: 560
+     tokenizer:
+       bpemodel: /storage-root/datasets/zhaojingtong/checkpoints/sensevoice/chn_jpn_yue_eng_ko_spectok.bpe.model
+       unk_symbol: <unk>
+       split_with_space: true
+     frontend:
+       fs: 16000
+       window: hamming
+       n_mels: 80
+       frame_length: 25
+       frame_shift: 10
+       lfr_m: 7
+       lfr_n: 6
+     length_normalized_loss: true
+     input_size: 560
+     vocab_size: 25055
+     sos: 1
+     eos: 2
+     ignore_id: -1
+   vad_kwargs:
+     encoder:
+       input_dim: 400
+       input_affine_dim: 140
+       fsmn_layers: 4
+       linear_dim: 250
+       proj_dim: 128
+       lorder: 20
+       rorder: 0
+       lstride: 1
+       rstride: 0
+       output_affine_dim: 140
+       output_dim: 248
+     frontend:
+       fs: 16000
+       window: hamming
+       n_mels: 80
+       frame_length: 25
+       frame_shift: 10
+       dither: 0.0
+       lfr_m: 5
+       lfr_n: 1
+       cmvn_file: /storage-root/datasets/zhaojingtong/checkpoints/sensevoice/am.mvn
+     sample_rate: 16000
+     detect_mode: 1
+     snr_mode: 0
+     max_end_silence_time: 800
+     max_start_silence_time: 3000
+     do_start_point_detection: True
+     do_end_point_detection: True
+     window_size_ms: 200
+     sil_to_speech_time_thres: 150
+     speech_to_sil_time_thres: 150
+     speech_2_noise_ratio: 1.0
+     do_extend: 1
+     lookback_time_start_point: 200
+     lookahead_time_end_point: 100
+     max_single_segment_time: 60000
+     snr_thres: -100.0
+     noise_frame_num_used_for_snr: 100
+     decibel_thres: -100.0
+     speech_noise_thres: 0.6
+     fe_prior_thres: 0.0001
+     silence_pdf_num: 1
+     sil_pdf_ids: [0]
+     speech_noise_thresh_low: -0.1
+     speech_noise_thresh_high: 0.3
+     output_frame_probs: False
+     frame_in_ms: 10
+     frame_length_ms: 25
+     tokenizer: None
+     vocab_size: -1
+     input_size: 400
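
Note: the frontend settings explain input_size: 560. Low-frame-rate (LFR) stacking with lfr_m: 7 concatenates 7 consecutive 80-dim mel frames (80 × 7 = 560) and advances lfr_n: 6 frames per step. An illustrative sketch of that stacking (not the actual modules.sensevoice.frontend code, whose edge handling may differ):

import numpy as np

def lfr_stack(feats, lfr_m=7, lfr_n=6):
    # feats: (T, 80) mel frames -> (ceil(T / lfr_n), 80 * lfr_m)
    out = []
    for i in range(0, len(feats), lfr_n):
        window = feats[i:i + lfr_m]
        if len(window) < lfr_m:  # pad the tail by repeating the last frame
            pad = np.repeat(window[-1:], lfr_m - len(window), axis=0)
            window = np.vstack([window, pad])
        out.append(window.reshape(-1))
    return np.stack(out)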
siglip/siglip.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3afbb189b33a42a2ef804a388fa88aea293741949a6cdd61035eeb094139bff1
+ size 1713053358
siglip/siglip.yaml ADDED
@@ -0,0 +1,29 @@
+ model:
+   _target_: modules.siglip.ImageEncoder
+   embeddings:
+     _target_: modules.siglip.SiglipVisionEmbeddings
+     hidden_size: 1152
+     image_size: 384
+     patch_size: 14
+     num_channels: 3
+   encoder:
+     _target_: modules.siglip.SiglipEncoder
+     hidden_size: 1152
+     num_attention_heads: 16
+     attention_dropout: 0.0
+     hidden_act: gelu_pytorch_tanh
+     intermediate_size: 4304
+     layer_norm_eps: 1e-6
+     num_hidden_layers: 27
+   head:
+     _target_: modules.siglip.SiglipMultiheadAttentionPoolingHead
+     mlp:
+       _target_: modules.siglip.SiglipMLP
+       hidden_act: gelu_pytorch_tanh
+       hidden_size: 1152
+       intermediate_size: 4304
+     hidden_size: 1152
+     num_attention_heads: 16
+     layer_norm_eps: 1e-6
+   hidden_size: 1152
+   layer_norm_eps: 1e-6
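
Note: assuming valid (unpadded) patchification as in the reference SigLIP vision embeddings, these settings fix the sequence length the 27-layer encoder sees: a 14×14 patch convolution with stride 14 over a 384×384 image gives (384 - 14) // 14 + 1 = 27 patches per side, i.e. 27 × 27 = 729 tokens of width 1152. A quick check of that arithmetic:

image_size, patch_size = 384, 14
side = (image_size - patch_size) // patch_size + 1  # valid conv, stride = patch_size
print(side, side * side)  # 27 729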
sv_encoder/sv_encoder.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:75759b2e87f6e8974f86d1d275c6a9d1c96f2a300b32e2c88ee5180331ff558a
+ size 884939345
sv_encoder/sv_encoder.yaml ADDED
@@ -0,0 +1,33 @@
+ model:
+   _target_: modules.gaudio.sv_encoder.SenseVoiceSmall
+   specaug:
+     _target_: modules.gaudio.sv_encoder.SpecAugLFR
+     apply_freq_mask: true
+     apply_time_mask: true
+     apply_time_warp: false
+     freq_mask_width_range: [0, 30]
+     lfr_rate: 6
+     num_freq_mask: 1
+     num_time_mask: 1
+     time_mask_width_range: [0, 12]
+     time_warp_mode: bicubic
+     time_warp_window: 5
+   encoder:
+     _target_: modules.gaudio.sv_encoder.SenseVoiceEncoderSmall
+     attention_dropout_rate: 0.1
+     attention_heads: 4
+     dropout_rate: 0.1
+     kernel_size: 11
+     linear_units: 2048
+     normalize_before: true
+     num_blocks: 50
+     output_size: 512
+     sanm_shfit: 0
+     tp_blocks: 20
+     input_size: 560
+   length_normalized_loss: true
+   input_size: 560
+   vocab_size: 25055
+   sos: 1
+   eos: 2
+   ignore_id: -1
+ ignore_id: -1
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c73565d76ce52078f48b8cd9a5ab59615c5519704c2fa6f52c5f6309915a70e7
+ size 480
tokenizer/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:200a2a14d4927029e80613d8f4f770359e5910d931906c5d1d195dcdae9c6828
+ size 17209899
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:95faea2e162d6f8336ccaebaff5e94b779e279673c8bb13a7f0d427162fb7ef7
+ size 55400