Commit
·
5e0c4e5
1
Parent(s):
42ed155
zjt
Browse files- .gitattributes +2 -0
- gaudio/audio_encoder.pt +3 -0
- gaudio/config.json +3 -0
- gaudio/preprocessor_config.json +3 -0
- gaudio/special_tokens_map.json +3 -0
- gaudio/text_encoder.pt +3 -0
- gaudio/tokenizer.json +3 -0
- gaudio/tokenizer_config.json +3 -0
- gaudio/vocab.txt +0 -0
- gdino/config.json +3 -0
- gdino/model.safetensors +3 -0
- gdino_processor/preprocessor_config.json +3 -0
- gdino_processor/special_tokens_map.json +3 -0
- gdino_processor/tokenizer.json +3 -0
- gdino_processor/tokenizer_config.json +3 -0
- gdino_processor/vocab.txt +0 -0
- gsam2_image/gsam2_image.pt +3 -0
- gsam2_image/gsam2_image.yaml +123 -0
- gsam2_video/gsam2_video.pt +3 -0
- gsam2_video/gsam2_video.yaml +125 -0
- llama/config.json +3 -0
- llama/generation_config.json +3 -0
- llama/model-00001-of-00004.safetensors +3 -0
- llama/model-00002-of-00004.safetensors +3 -0
- llama/model-00003-of-00004.safetensors +3 -0
- llama/model-00004-of-00004.safetensors +3 -0
- llama/model.safetensors.index.json +3 -0
- sensevoice/am.mvn +8 -0
- sensevoice/chn_jpn_yue_eng_ko_spectok.bpe.model +3 -0
- sensevoice/sensevoice.pt +3 -0
- sensevoice/sensevoice.yaml +200 -0
- siglip/siglip.pt +3 -0
- siglip/siglip.yaml +29 -0
- sv_encoder/sv_encoder.pt +3 -0
- sv_encoder/sv_encoder.yaml +33 -0
- tokenizer/special_tokens_map.json +3 -0
- tokenizer/tokenizer.json +3 -0
- tokenizer/tokenizer_config.json +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
*.json filter=lfs diff=lfs merge=lfs -text
|
37 |
+
*.jsonl filter=lfs diff=lfs merge=lfs -text
|
gaudio/audio_encoder.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:75759b2e87f6e8974f86d1d275c6a9d1c96f2a300b32e2c88ee5180331ff558a
|
3 |
+
size 884939345
|
gaudio/config.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:de7152eec4e7d795e9c2855fa971dcb22af49fdab01bd2177c41bb6ae5c64fdf
|
3 |
+
size 2585
|
gaudio/preprocessor_config.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ba6754d6c8d51e343d5cd431549cc0aad8f41a4464a23d3dcf3d299859bfe084
|
3 |
+
size 604
|
gaudio/special_tokens_map.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5d5b662e421ea9fac075174bb0688ee0d9431699900b90662acd44b2a350503a
|
3 |
+
size 695
|
gaudio/text_encoder.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ea9e501c7b83c1fc4f2d2d9ac0c089c4a64029211183a4b3d64394af35f1e7a7
|
3 |
+
size 435643029
|
gaudio/tokenizer.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d241a60d5e8f04cc1b2b3e9ef7a4921b27bf526d9f6050ab90f9267a1f9e5c66
|
3 |
+
size 711396
|
gaudio/tokenizer_config.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ba7f777cdd4b947ffa1428b898637bfd00ec9c8fc09fc416645881e5b3dc323c
|
3 |
+
size 1289
|
gaudio/vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
gdino/config.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:89ca35a59ea32610b61561b3dc07c785ff539634223097306b20bd3581a7ce4a
|
3 |
+
size 1789
|
gdino/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4f29e24728239929bfc149fb5fe238b3707fddd6048d87d8eaabb38181c79a8b
|
3 |
+
size 933400776
|
gdino_processor/preprocessor_config.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c17fd68afb1f124bfb87a494d409925eec4201b3487c8a694dde64d9ce7109a3
|
3 |
+
size 511
|
gdino_processor/special_tokens_map.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5d5b662e421ea9fac075174bb0688ee0d9431699900b90662acd44b2a350503a
|
3 |
+
size 695
|
gdino_processor/tokenizer.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d241a60d5e8f04cc1b2b3e9ef7a4921b27bf526d9f6050ab90f9267a1f9e5c66
|
3 |
+
size 711396
|
gdino_processor/tokenizer_config.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d40ab645b68211910b9170d22433d43186a6ec8ee6fd10ba170524b25bf4fb56
|
3 |
+
size 1237
|
gdino_processor/vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
gsam2_image/gsam2_image.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2b3b1bc43ac04d206b52dc70e9032dc8ed55982d47c8ff5619234c9f349e6d6b
|
3 |
+
size 898072910
|
gsam2_image/gsam2_image.yaml
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# @package _global_
|
2 |
+
|
3 |
+
# Model
|
4 |
+
model:
|
5 |
+
_target_: modules.gsam2.sam2.sam2_base.SAM2Base
|
6 |
+
image_encoder:
|
7 |
+
_target_: modules.gsam2.sam2.image_encoder.ImageEncoder
|
8 |
+
scalp: 1
|
9 |
+
trunk:
|
10 |
+
_target_: modules.gsam2.sam2.hieradet.Hiera
|
11 |
+
embed_dim: 144
|
12 |
+
num_heads: 2
|
13 |
+
stages: [2, 6, 36, 4]
|
14 |
+
global_att_blocks: [23, 33, 43]
|
15 |
+
window_pos_embed_bkg_spatial_size: [7, 7]
|
16 |
+
window_spec: [8, 4, 16, 8]
|
17 |
+
neck:
|
18 |
+
_target_: modules.gsam2.sam2.image_encoder.FpnNeck
|
19 |
+
position_encoding:
|
20 |
+
_target_: modules.gsam2.sam2.position_encoding.PositionEmbeddingSine
|
21 |
+
num_pos_feats: 256
|
22 |
+
normalize: true
|
23 |
+
scale: null
|
24 |
+
temperature: 10000
|
25 |
+
d_model: 256
|
26 |
+
backbone_channel_list: [1152, 576, 288, 144]
|
27 |
+
fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
|
28 |
+
fpn_interp_model: nearest
|
29 |
+
|
30 |
+
memory_attention:
|
31 |
+
_target_: modules.gsam2.sam2.memory_attention.MemoryAttention
|
32 |
+
d_model: 256
|
33 |
+
pos_enc_at_input: true
|
34 |
+
layer:
|
35 |
+
_target_: modules.gsam2.sam2.memory_attention.MemoryAttentionLayer
|
36 |
+
activation: relu
|
37 |
+
dim_feedforward: 2048
|
38 |
+
dropout: 0.1
|
39 |
+
pos_enc_at_attn: false
|
40 |
+
self_attention:
|
41 |
+
_target_: modules.gsam2.sam2.transformer.RoPEAttention
|
42 |
+
rope_theta: 10000.0
|
43 |
+
feat_sizes: [32, 32]
|
44 |
+
embedding_dim: 256
|
45 |
+
num_heads: 1
|
46 |
+
downsample_rate: 1
|
47 |
+
dropout: 0.1
|
48 |
+
d_model: 256
|
49 |
+
pos_enc_at_cross_attn_keys: true
|
50 |
+
pos_enc_at_cross_attn_queries: false
|
51 |
+
cross_attention:
|
52 |
+
_target_: modules.gsam2.sam2.transformer.RoPEAttention
|
53 |
+
rope_theta: 10000.0
|
54 |
+
feat_sizes: [32, 32]
|
55 |
+
rope_k_repeat: True
|
56 |
+
embedding_dim: 256
|
57 |
+
num_heads: 1
|
58 |
+
downsample_rate: 1
|
59 |
+
dropout: 0.1
|
60 |
+
kv_in_dim: 64
|
61 |
+
num_layers: 4
|
62 |
+
|
63 |
+
memory_encoder:
|
64 |
+
_target_: modules.gsam2.sam2.memory_encoder.MemoryEncoder
|
65 |
+
out_dim: 64
|
66 |
+
position_encoding:
|
67 |
+
_target_: modules.gsam2.sam2.position_encoding.PositionEmbeddingSine
|
68 |
+
num_pos_feats: 64
|
69 |
+
normalize: true
|
70 |
+
scale: null
|
71 |
+
temperature: 10000
|
72 |
+
mask_downsampler:
|
73 |
+
_target_: modules.gsam2.sam2.memory_encoder.MaskDownSampler
|
74 |
+
kernel_size: 3
|
75 |
+
stride: 2
|
76 |
+
padding: 1
|
77 |
+
fuser:
|
78 |
+
_target_: modules.gsam2.sam2.memory_encoder.Fuser
|
79 |
+
layer:
|
80 |
+
_target_: modules.gsam2.sam2.memory_encoder.CXBlock
|
81 |
+
dim: 256
|
82 |
+
kernel_size: 7
|
83 |
+
padding: 3
|
84 |
+
layer_scale_init_value: 1e-6
|
85 |
+
use_dwconv: True # depth-wise convs
|
86 |
+
num_layers: 2
|
87 |
+
sam_mask_decoder_extra_args:
|
88 |
+
dynamic_multimask_via_stability: true
|
89 |
+
dynamic_multimask_stability_delta: 0.05
|
90 |
+
dynamic_multimask_stability_thresh: 0.98
|
91 |
+
num_maskmem: 7
|
92 |
+
image_size: 1024
|
93 |
+
# apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
|
94 |
+
sigmoid_scale_for_mem_enc: 20.0
|
95 |
+
sigmoid_bias_for_mem_enc: -10.0
|
96 |
+
use_mask_input_as_output_without_sam: true
|
97 |
+
# Memory
|
98 |
+
directly_add_no_mem_embed: true
|
99 |
+
no_obj_embed_spatial: true
|
100 |
+
# use high-resolution feature map in the SAM mask decoder
|
101 |
+
use_high_res_features_in_sam: true
|
102 |
+
# output 3 masks on the first click on initial conditioning frames
|
103 |
+
multimask_output_in_sam: true
|
104 |
+
# SAM heads
|
105 |
+
iou_prediction_use_sigmoid: True
|
106 |
+
# cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
|
107 |
+
use_obj_ptrs_in_encoder: true
|
108 |
+
add_tpos_enc_to_obj_ptrs: true
|
109 |
+
proj_tpos_enc_in_obj_ptrs: true
|
110 |
+
use_signed_tpos_enc_to_obj_ptrs: true
|
111 |
+
only_obj_ptrs_in_the_past_for_eval: true
|
112 |
+
# object occlusion prediction
|
113 |
+
pred_obj_scores: true
|
114 |
+
pred_obj_scores_mlp: true
|
115 |
+
fixed_no_obj_ptr: true
|
116 |
+
# multimask tracking settings
|
117 |
+
multimask_output_for_tracking: true
|
118 |
+
use_multimask_token_for_obj_ptr: true
|
119 |
+
multimask_min_pt_num: 0
|
120 |
+
multimask_max_pt_num: 1
|
121 |
+
use_mlp_for_obj_ptr_proj: true
|
122 |
+
# Compilation flag
|
123 |
+
compile_image_encoder: False
|
gsam2_video/gsam2_video.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2b3b1bc43ac04d206b52dc70e9032dc8ed55982d47c8ff5619234c9f349e6d6b
|
3 |
+
size 898072910
|
gsam2_video/gsam2_video.yaml
ADDED
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model:
|
2 |
+
_target_: modules.gsam2.sam2_video_predictor.SAM2VideoPredictor
|
3 |
+
image_encoder:
|
4 |
+
_target_: modules.gsam2.sam2.image_encoder.ImageEncoder
|
5 |
+
scalp: 1
|
6 |
+
trunk:
|
7 |
+
_target_: modules.gsam2.sam2.hieradet.Hiera
|
8 |
+
embed_dim: 144
|
9 |
+
num_heads: 2
|
10 |
+
stages: [2, 6, 36, 4]
|
11 |
+
global_att_blocks: [23, 33, 43]
|
12 |
+
window_pos_embed_bkg_spatial_size: [7, 7]
|
13 |
+
window_spec: [8, 4, 16, 8]
|
14 |
+
neck:
|
15 |
+
_target_: modules.gsam2.sam2.image_encoder.FpnNeck
|
16 |
+
position_encoding:
|
17 |
+
_target_: modules.gsam2.sam2.position_encoding.PositionEmbeddingSine
|
18 |
+
num_pos_feats: 256
|
19 |
+
normalize: true
|
20 |
+
scale: null
|
21 |
+
temperature: 10000
|
22 |
+
d_model: 256
|
23 |
+
backbone_channel_list: [1152, 576, 288, 144]
|
24 |
+
fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
|
25 |
+
fpn_interp_model: nearest
|
26 |
+
|
27 |
+
memory_attention:
|
28 |
+
_target_: modules.gsam2.sam2.memory_attention.MemoryAttention
|
29 |
+
d_model: 256
|
30 |
+
pos_enc_at_input: true
|
31 |
+
layer:
|
32 |
+
_target_: modules.gsam2.sam2.memory_attention.MemoryAttentionLayer
|
33 |
+
activation: relu
|
34 |
+
dim_feedforward: 2048
|
35 |
+
dropout: 0.1
|
36 |
+
pos_enc_at_attn: false
|
37 |
+
self_attention:
|
38 |
+
_target_: modules.gsam2.sam2.transformer.RoPEAttention
|
39 |
+
rope_theta: 10000.0
|
40 |
+
feat_sizes: [32, 32]
|
41 |
+
embedding_dim: 256
|
42 |
+
num_heads: 1
|
43 |
+
downsample_rate: 1
|
44 |
+
dropout: 0.1
|
45 |
+
d_model: 256
|
46 |
+
pos_enc_at_cross_attn_keys: true
|
47 |
+
pos_enc_at_cross_attn_queries: false
|
48 |
+
cross_attention:
|
49 |
+
_target_: modules.gsam2.sam2.transformer.RoPEAttention
|
50 |
+
rope_theta: 10000.0
|
51 |
+
feat_sizes: [32, 32]
|
52 |
+
rope_k_repeat: True
|
53 |
+
embedding_dim: 256
|
54 |
+
num_heads: 1
|
55 |
+
downsample_rate: 1
|
56 |
+
dropout: 0.1
|
57 |
+
kv_in_dim: 64
|
58 |
+
num_layers: 4
|
59 |
+
|
60 |
+
memory_encoder:
|
61 |
+
_target_: modules.gsam2.sam2.memory_encoder.MemoryEncoder
|
62 |
+
out_dim: 64
|
63 |
+
position_encoding:
|
64 |
+
_target_: modules.gsam2.sam2.position_encoding.PositionEmbeddingSine
|
65 |
+
num_pos_feats: 64
|
66 |
+
normalize: true
|
67 |
+
scale: null
|
68 |
+
temperature: 10000
|
69 |
+
mask_downsampler:
|
70 |
+
_target_: modules.gsam2.sam2.memory_encoder.MaskDownSampler
|
71 |
+
kernel_size: 3
|
72 |
+
stride: 2
|
73 |
+
padding: 1
|
74 |
+
fuser:
|
75 |
+
_target_: modules.gsam2.sam2.memory_encoder.Fuser
|
76 |
+
layer:
|
77 |
+
_target_: modules.gsam2.sam2.memory_encoder.CXBlock
|
78 |
+
dim: 256
|
79 |
+
kernel_size: 7
|
80 |
+
padding: 3
|
81 |
+
layer_scale_init_value: 1e-6
|
82 |
+
use_dwconv: True # depth-wise convs
|
83 |
+
num_layers: 2
|
84 |
+
|
85 |
+
sam_mask_decoder_extra_args:
|
86 |
+
dynamic_multimask_via_stability: true
|
87 |
+
dynamic_multimask_stability_delta: 0.05
|
88 |
+
dynamic_multimask_stability_thresh: 0.98
|
89 |
+
# binarize the sigmoid mask logits on interacted frames with clicks in the memory encoder so that the encoded masks are exactly what users see from clicking
|
90 |
+
binarize_mask_from_pts_for_mem_enc: true
|
91 |
+
# fill small holes in the low-res masks up to `fill_hole_area` (before resizing them to the original video resolution)
|
92 |
+
fill_hole_area: 8
|
93 |
+
num_maskmem: 7
|
94 |
+
image_size: 1024
|
95 |
+
# apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
|
96 |
+
sigmoid_scale_for_mem_enc: 20.0
|
97 |
+
sigmoid_bias_for_mem_enc: -10.0
|
98 |
+
use_mask_input_as_output_without_sam: true
|
99 |
+
# Memory
|
100 |
+
directly_add_no_mem_embed: true
|
101 |
+
no_obj_embed_spatial: true
|
102 |
+
# use high-resolution feature map in the SAM mask decoder
|
103 |
+
use_high_res_features_in_sam: true
|
104 |
+
# output 3 masks on the first click on initial conditioning frames
|
105 |
+
multimask_output_in_sam: true
|
106 |
+
# SAM heads
|
107 |
+
iou_prediction_use_sigmoid: True
|
108 |
+
# cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
|
109 |
+
use_obj_ptrs_in_encoder: true
|
110 |
+
add_tpos_enc_to_obj_ptrs: true
|
111 |
+
proj_tpos_enc_in_obj_ptrs: true
|
112 |
+
use_signed_tpos_enc_to_obj_ptrs: true
|
113 |
+
only_obj_ptrs_in_the_past_for_eval: true
|
114 |
+
# object occlusion prediction
|
115 |
+
pred_obj_scores: true
|
116 |
+
pred_obj_scores_mlp: true
|
117 |
+
fixed_no_obj_ptr: true
|
118 |
+
# multimask tracking settings
|
119 |
+
multimask_output_for_tracking: true
|
120 |
+
use_multimask_token_for_obj_ptr: true
|
121 |
+
multimask_min_pt_num: 0
|
122 |
+
multimask_max_pt_num: 1
|
123 |
+
use_mlp_for_obj_ptr_proj: true
|
124 |
+
# Compilation flag
|
125 |
+
compile_image_encoder: False
|
llama/config.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0f7941a3d70fc216b82de7df97863937dcd33e644416665b89dbdc67b1daa66a
|
3 |
+
size 928
|
llama/generation_config.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8d4beaeab19fc8c2d4b0725864a97f73bf787b1bd4bec222cc9e1e6a8030ca9c
|
3 |
+
size 230
|
llama/model-00001-of-00004.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:477ff93289f45ff953aa9f7ce6331d5fce4ee466f53865301333ea0ccece72dd
|
3 |
+
size 4976698592
|
llama/model-00002-of-00004.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8899cf9adf52f3fa093a6469f4efaecf66d9b62317eddde424ae8358d704a05b
|
3 |
+
size 4999802616
|
llama/model-00003-of-00004.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fe29f6c7b95a76fafb3f9dcd614f036f41f44c19ff2e4cf58a9e2ccf442e10b1
|
3 |
+
size 4915916080
|
llama/model-00004-of-00004.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:013bc9e3d5d06232d8a358ad3cd323b732255d31e2debe3bf0c6325b88110ae0
|
3 |
+
size 1168138808
|
llama/model.safetensors.index.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:146776fce3f6db1103aa6f249e65ee5544c5923ce6f971b092eee79aa6e5d37b
|
3 |
+
size 23950
|
sensevoice/am.mvn
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<Nnet>
|
2 |
+
<Splice> 400 400
|
3 |
+
[ 0 ]
|
4 |
+
<AddShift> 400 400
|
5 |
+
<LearnRateCoef> 0 [ -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 
-13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 ]
|
6 |
+
<Rescale> 400 400
|
7 |
+
<LearnRateCoef> 0 [ 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 
0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 ]
|
8 |
+
</Nnet>
|
sensevoice/chn_jpn_yue_eng_ko_spectok.bpe.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:aa87f86064c3730d799ddf7af3c04659151102cba548bce325cf06ba4da4e6a8
|
3 |
+
size 377341
|
sensevoice/sensevoice.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f6280b847a0ec99377af351999a2d2223d005dd3161d4005328db35820d6f91d
|
3 |
+
size 938091356
|
sensevoice/sensevoice.yaml
ADDED
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model:
|
2 |
+
_target_: modules.sensevoice.model.SenseVoice
|
3 |
+
model:
|
4 |
+
_target_: modules.sensevoice.sensevoicesmall.SenseVoiceSmall
|
5 |
+
specaug:
|
6 |
+
_target_: modules.sensevoice.specaug.SpecAugLFR
|
7 |
+
apply_freq_mask: true
|
8 |
+
apply_time_mask: true
|
9 |
+
apply_time_warp: false
|
10 |
+
freq_mask_width_range: [0, 30]
|
11 |
+
lfr_rate: 6
|
12 |
+
num_freq_mask: 1
|
13 |
+
num_time_mask: 1
|
14 |
+
time_mask_width_range: [0, 12]
|
15 |
+
time_warp_mode: bicubic
|
16 |
+
time_warp_window: 5
|
17 |
+
encoder:
|
18 |
+
_target_: modules.sensevoice.sensevoicesmall.SenseVoiceEncoderSmall
|
19 |
+
attention_dropout_rate: 0.1
|
20 |
+
attention_heads: 4
|
21 |
+
dropout_rate: 0.1
|
22 |
+
kernel_size: 11
|
23 |
+
linear_units: 2048
|
24 |
+
normalize_before: true
|
25 |
+
num_blocks: 50
|
26 |
+
output_size: 512
|
27 |
+
sanm_shfit: 0
|
28 |
+
tp_blocks: 20
|
29 |
+
input_size: 560
|
30 |
+
tokenizer:
|
31 |
+
_target_: modules.sensevoice.tokenizer.SentencepiecesTokenizer
|
32 |
+
bpemodel: /storage-root/datasets/zhaojingtong/checkpoints/sensevoice/chn_jpn_yue_eng_ko_spectok.bpe.model
|
33 |
+
unk_symbol: <unk>
|
34 |
+
split_with_space: true
|
35 |
+
frontend:
|
36 |
+
_target_: modules.sensevoice.frontend.WavFrontend
|
37 |
+
fs: 16000
|
38 |
+
window: hamming
|
39 |
+
n_mels: 80
|
40 |
+
frame_length: 25
|
41 |
+
frame_shift: 10
|
42 |
+
lfr_m: 7
|
43 |
+
lfr_n: 6
|
44 |
+
length_normalized_loss: true
|
45 |
+
input_size: 560
|
46 |
+
vocab_size: 25055
|
47 |
+
sos: 1
|
48 |
+
eos: 2
|
49 |
+
ignore_id: -1
|
50 |
+
vad_model:
|
51 |
+
_target_: modules.sensevoice.vad.FsmnVADStreaming
|
52 |
+
encoder:
|
53 |
+
_target_: modules.sensevoice.vad.FSMN
|
54 |
+
input_dim: 400
|
55 |
+
input_affine_dim: 140
|
56 |
+
fsmn_layers: 4
|
57 |
+
linear_dim: 250
|
58 |
+
proj_dim: 128
|
59 |
+
lorder: 20
|
60 |
+
rorder: 0
|
61 |
+
lstride: 1
|
62 |
+
rstride: 0
|
63 |
+
output_affine_dim: 140
|
64 |
+
output_dim: 248
|
65 |
+
frontend:
|
66 |
+
_target_: modules.sensevoice.frontend.WavFrontendOnline
|
67 |
+
fs: 16000
|
68 |
+
window: hamming
|
69 |
+
n_mels: 80
|
70 |
+
frame_length: 25
|
71 |
+
frame_shift: 10
|
72 |
+
dither: 0.0
|
73 |
+
lfr_m: 5
|
74 |
+
lfr_n: 1
|
75 |
+
cmvn_file: /storage-root/datasets/zhaojingtong/checkpoints/sensevoice/am.mvn
|
76 |
+
sample_rate: 16000
|
77 |
+
detect_mode: 1
|
78 |
+
snr_mode: 0
|
79 |
+
max_end_silence_time: 800
|
80 |
+
max_start_silence_time: 3000
|
81 |
+
do_start_point_detection: True
|
82 |
+
do_end_point_detection: True
|
83 |
+
window_size_ms: 200
|
84 |
+
sil_to_speech_time_thres: 150
|
85 |
+
speech_to_sil_time_thres: 150
|
86 |
+
speech_2_noise_ratio: 1.0
|
87 |
+
do_extend: 1
|
88 |
+
lookback_time_start_point: 200
|
89 |
+
lookahead_time_end_point: 100
|
90 |
+
max_single_segment_time: 60000
|
91 |
+
snr_thres: -100.0
|
92 |
+
noise_frame_num_used_for_snr: 100
|
93 |
+
decibel_thres: -100.0
|
94 |
+
speech_noise_thres: 0.6
|
95 |
+
fe_prior_thres: 0.0001
|
96 |
+
silence_pdf_num: 1
|
97 |
+
sil_pdf_ids: [0]
|
98 |
+
speech_noise_thresh_low: -0.1
|
99 |
+
speech_noise_thresh_high: 0.3
|
100 |
+
output_frame_probs: False
|
101 |
+
frame_in_ms: 10
|
102 |
+
frame_length_ms: 25
|
103 |
+
tokenizer: None
|
104 |
+
vocab_size: -1
|
105 |
+
input_size: 400
|
106 |
+
kwargs:
|
107 |
+
specaug:
|
108 |
+
apply_freq_mask: true
|
109 |
+
apply_time_mask: true
|
110 |
+
apply_time_warp: false
|
111 |
+
freq_mask_width_range: [0, 30]
|
112 |
+
lfr_rate: 6
|
113 |
+
num_freq_mask: 1
|
114 |
+
num_time_mask: 1
|
115 |
+
time_mask_width_range: [0, 12]
|
116 |
+
time_warp_mode: bicubic
|
117 |
+
time_warp_window: 5
|
118 |
+
encoder:
|
119 |
+
attention_dropout_rate: 0.1
|
120 |
+
attention_heads: 4
|
121 |
+
dropout_rate: 0.1
|
122 |
+
kernel_size: 11
|
123 |
+
linear_units: 2048
|
124 |
+
normalize_before: true
|
125 |
+
num_blocks: 50
|
126 |
+
output_size: 512
|
127 |
+
sanm_shfit: 0
|
128 |
+
tp_blocks: 20
|
129 |
+
input_size: 560
|
130 |
+
tokenizer:
|
131 |
+
bpemodel: /storage-root/datasets/zhaojingtong/checkpoints/sensevoice/chn_jpn_yue_eng_ko_spectok.bpe.model
|
132 |
+
unk_symbol: <unk>
|
133 |
+
split_with_space: true
|
134 |
+
frontend:
|
135 |
+
fs: 16000
|
136 |
+
window: hamming
|
137 |
+
n_mels: 80
|
138 |
+
frame_length: 25
|
139 |
+
frame_shift: 10
|
140 |
+
lfr_m: 7
|
141 |
+
lfr_n: 6
|
142 |
+
length_normalized_loss: true
|
143 |
+
input_size: 560
|
144 |
+
vocab_size: 25055
|
145 |
+
sos: 1
|
146 |
+
eos: 2
|
147 |
+
ignore_id: -1
|
148 |
+
vad_kwargs:
|
149 |
+
encoder:
|
150 |
+
input_dim: 400
|
151 |
+
input_affine_dim: 140
|
152 |
+
fsmn_layers: 4
|
153 |
+
linear_dim: 250
|
154 |
+
proj_dim: 128
|
155 |
+
lorder: 20
|
156 |
+
rorder: 0
|
157 |
+
lstride: 1
|
158 |
+
rstride: 0
|
159 |
+
output_affine_dim: 140
|
160 |
+
output_dim: 248
|
161 |
+
frontend:
|
162 |
+
fs: 16000
|
163 |
+
window: hamming
|
164 |
+
n_mels: 80
|
165 |
+
frame_length: 25
|
166 |
+
frame_shift: 10
|
167 |
+
dither: 0.0
|
168 |
+
lfr_m: 5
|
169 |
+
lfr_n: 1
|
170 |
+
cmvn_file: /storage-root/datasets/zhaojingtong/checkpoints/sensevoice/am.mvn
|
171 |
+
sample_rate: 16000
|
172 |
+
detect_mode: 1
|
173 |
+
snr_mode: 0
|
174 |
+
max_end_silence_time: 800
|
175 |
+
max_start_silence_time: 3000
|
176 |
+
do_start_point_detection: True
|
177 |
+
do_end_point_detection: True
|
178 |
+
window_size_ms: 200
|
179 |
+
sil_to_speech_time_thres: 150
|
180 |
+
speech_to_sil_time_thres: 150
|
181 |
+
speech_2_noise_ratio: 1.0
|
182 |
+
do_extend: 1
|
183 |
+
lookback_time_start_point: 200
|
184 |
+
lookahead_time_end_point: 100
|
185 |
+
max_single_segment_time: 60000
|
186 |
+
snr_thres: -100.0
|
187 |
+
noise_frame_num_used_for_snr: 100
|
188 |
+
decibel_thres: -100.0
|
189 |
+
speech_noise_thres: 0.6
|
190 |
+
fe_prior_thres: 0.0001
|
191 |
+
silence_pdf_num: 1
|
192 |
+
sil_pdf_ids: [0]
|
193 |
+
speech_noise_thresh_low: -0.1
|
194 |
+
speech_noise_thresh_high: 0.3
|
195 |
+
output_frame_probs: False
|
196 |
+
frame_in_ms: 10
|
197 |
+
frame_length_ms: 25
|
198 |
+
tokenizer: None
|
199 |
+
vocab_size: -1
|
200 |
+
input_size: 400
|
siglip/siglip.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3afbb189b33a42a2ef804a388fa88aea293741949a6cdd61035eeb094139bff1
|
3 |
+
size 1713053358
|
siglip/siglip.yaml
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model:
|
2 |
+
_target_: modules.siglip.ImageEncoder
|
3 |
+
embeddings:
|
4 |
+
_target_: modules.siglip.SiglipVisionEmbeddings
|
5 |
+
hidden_size: 1152
|
6 |
+
image_size: 384
|
7 |
+
patch_size: 14
|
8 |
+
num_channels: 3
|
9 |
+
encoder:
|
10 |
+
_target_: modules.siglip.SiglipEncoder
|
11 |
+
hidden_size: 1152
|
12 |
+
num_attention_heads: 16
|
13 |
+
attention_dropout: 0.0
|
14 |
+
hidden_act: gelu_pytorch_tanh
|
15 |
+
intermediate_size: 4304
|
16 |
+
layer_norm_eps: 1e-6
|
17 |
+
num_hidden_layers: 27
|
18 |
+
head:
|
19 |
+
_target_: modules.siglip.SiglipMultiheadAttentionPoolingHead
|
20 |
+
mlp:
|
21 |
+
_target_: modules.siglip.SiglipMLP
|
22 |
+
hidden_act: gelu_pytorch_tanh
|
23 |
+
hidden_size: 1152
|
24 |
+
intermediate_size: 4304
|
25 |
+
hidden_size: 1152
|
26 |
+
num_attention_heads: 16
|
27 |
+
layer_norm_eps: 1e-6
|
28 |
+
hidden_size: 1152
|
29 |
+
layer_norm_eps: 1e-6
|
sv_encoder/sv_encoder.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:75759b2e87f6e8974f86d1d275c6a9d1c96f2a300b32e2c88ee5180331ff558a
|
3 |
+
size 884939345
|
sv_encoder/sv_encoder.yaml
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model:
|
2 |
+
_target_: modules.gaudio.sv_encoder.SenseVoiceSmall
|
3 |
+
specaug:
|
4 |
+
_target_: modules.gaudio.sv_encoder.SpecAugLFR
|
5 |
+
apply_freq_mask: true
|
6 |
+
apply_time_mask: true
|
7 |
+
apply_time_warp: false
|
8 |
+
freq_mask_width_range: [0, 30]
|
9 |
+
lfr_rate: 6
|
10 |
+
num_freq_mask: 1
|
11 |
+
num_time_mask: 1
|
12 |
+
time_mask_width_range: [0, 12]
|
13 |
+
time_warp_mode: bicubic
|
14 |
+
time_warp_window: 5
|
15 |
+
encoder:
|
16 |
+
_target_: modules.gaudio.sv_encoder.SenseVoiceEncoderSmall
|
17 |
+
attention_dropout_rate: 0.1
|
18 |
+
attention_heads: 4
|
19 |
+
dropout_rate: 0.1
|
20 |
+
kernel_size: 11
|
21 |
+
linear_units: 2048
|
22 |
+
normalize_before: true
|
23 |
+
num_blocks: 50
|
24 |
+
output_size: 512
|
25 |
+
sanm_shfit: 0
|
26 |
+
tp_blocks: 20
|
27 |
+
input_size: 560
|
28 |
+
length_normalized_loss: true
|
29 |
+
input_size: 560
|
30 |
+
vocab_size: 25055
|
31 |
+
sos: 1
|
32 |
+
eos: 2
|
33 |
+
ignore_id: -1
|
tokenizer/special_tokens_map.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c73565d76ce52078f48b8cd9a5ab59615c5519704c2fa6f52c5f6309915a70e7
|
3 |
+
size 480
|
tokenizer/tokenizer.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:200a2a14d4927029e80613d8f4f770359e5910d931906c5d1d195dcdae9c6828
|
3 |
+
size 17209899
|
tokenizer/tokenizer_config.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:95faea2e162d6f8336ccaebaff5e94b779e279673c8bb13a7f0d427162fb7ef7
|
3 |
+
size 55400
|