Night-Quiet committed
Commit 5e0c4e5 · 1 Parent(s): 42ed155
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.json filter=lfs diff=lfs merge=lfs -text
+ *.jsonl filter=lfs diff=lfs merge=lfs -text
gaudio/audio_encoder.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:75759b2e87f6e8974f86d1d275c6a9d1c96f2a300b32e2c88ee5180331ff558a
+ size 884939345
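
Note: most files in this commit are checked in as Git LFS pointers rather than the blobs themselves: three "key value" lines giving the spec version, the sha256 of the real content, and its byte size. A minimal Python sketch of reading those fields back (parse_pointer is an illustrative helper, not code from this repo):

def parse_pointer(path):
    # An LFS pointer is a short text file of "key value" lines.
    fields = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

ptr = parse_pointer("gaudio/audio_encoder.pt")
assert ptr["version"] == "https://git-lfs.github.com/spec/v1"
print(ptr["oid"], ptr["size"])  # sha256:75759b2e... 884939345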
gaudio/config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:de7152eec4e7d795e9c2855fa971dcb22af49fdab01bd2177c41bb6ae5c64fdf
+ size 2585
gaudio/preprocessor_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ba6754d6c8d51e343d5cd431549cc0aad8f41a4464a23d3dcf3d299859bfe084
+ size 604
gaudio/special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5d5b662e421ea9fac075174bb0688ee0d9431699900b90662acd44b2a350503a
+ size 695
gaudio/text_encoder.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ea9e501c7b83c1fc4f2d2d9ac0c089c4a64029211183a4b3d64394af35f1e7a7
+ size 435643029
gaudio/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d241a60d5e8f04cc1b2b3e9ef7a4921b27bf526d9f6050ab90f9267a1f9e5c66
+ size 711396
gaudio/tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ba7f777cdd4b947ffa1428b898637bfd00ec9c8fc09fc416645881e5b3dc323c
+ size 1289
gaudio/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
gdino/config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:89ca35a59ea32610b61561b3dc07c785ff539634223097306b20bd3581a7ce4a
+ size 1789
gdino/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4f29e24728239929bfc149fb5fe238b3707fddd6048d87d8eaabb38181c79a8b
+ size 933400776
gdino_processor/preprocessor_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c17fd68afb1f124bfb87a494d409925eec4201b3487c8a694dde64d9ce7109a3
+ size 511
gdino_processor/special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5d5b662e421ea9fac075174bb0688ee0d9431699900b90662acd44b2a350503a
+ size 695
gdino_processor/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d241a60d5e8f04cc1b2b3e9ef7a4921b27bf526d9f6050ab90f9267a1f9e5c66
+ size 711396
gdino_processor/tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d40ab645b68211910b9170d22433d43186a6ec8ee6fd10ba170524b25bf4fb56
+ size 1237
gdino_processor/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
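
Note: gdino/ and gdino_processor/ follow the directory layout that transformers' Grounding DINO classes read, so loading could plausibly look like the sketch below once the LFS blobs are pulled. The class names are the stock transformers API; whether this repo loads the weights this way is an assumption.

from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

# Local directories from this commit; LFS blobs must be pulled first.
processor = AutoProcessor.from_pretrained("gdino_processor")
model = AutoModelForZeroShotObjectDetection.from_pretrained("gdino")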
 
gsam2_image/gsam2_image.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2b3b1bc43ac04d206b52dc70e9032dc8ed55982d47c8ff5619234c9f349e6d6b
+ size 898072910
gsam2_image/gsam2_image.yaml ADDED
@@ -0,0 +1,123 @@
+ # @package _global_
+
+ # Model
+ model:
+   _target_: modules.gsam2.sam2.sam2_base.SAM2Base
+   image_encoder:
+     _target_: modules.gsam2.sam2.image_encoder.ImageEncoder
+     scalp: 1
+     trunk:
+       _target_: modules.gsam2.sam2.hieradet.Hiera
+       embed_dim: 144
+       num_heads: 2
+       stages: [2, 6, 36, 4]
+       global_att_blocks: [23, 33, 43]
+       window_pos_embed_bkg_spatial_size: [7, 7]
+       window_spec: [8, 4, 16, 8]
+     neck:
+       _target_: modules.gsam2.sam2.image_encoder.FpnNeck
+       position_encoding:
+         _target_: modules.gsam2.sam2.position_encoding.PositionEmbeddingSine
+         num_pos_feats: 256
+         normalize: true
+         scale: null
+         temperature: 10000
+       d_model: 256
+       backbone_channel_list: [1152, 576, 288, 144]
+       fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
+       fpn_interp_model: nearest
+
+   memory_attention:
+     _target_: modules.gsam2.sam2.memory_attention.MemoryAttention
+     d_model: 256
+     pos_enc_at_input: true
+     layer:
+       _target_: modules.gsam2.sam2.memory_attention.MemoryAttentionLayer
+       activation: relu
+       dim_feedforward: 2048
+       dropout: 0.1
+       pos_enc_at_attn: false
+       self_attention:
+         _target_: modules.gsam2.sam2.transformer.RoPEAttention
+         rope_theta: 10000.0
+         feat_sizes: [32, 32]
+         embedding_dim: 256
+         num_heads: 1
+         downsample_rate: 1
+         dropout: 0.1
+       d_model: 256
+       pos_enc_at_cross_attn_keys: true
+       pos_enc_at_cross_attn_queries: false
+       cross_attention:
+         _target_: modules.gsam2.sam2.transformer.RoPEAttention
+         rope_theta: 10000.0
+         feat_sizes: [32, 32]
+         rope_k_repeat: True
+         embedding_dim: 256
+         num_heads: 1
+         downsample_rate: 1
+         dropout: 0.1
+         kv_in_dim: 64
+     num_layers: 4
+
+   memory_encoder:
+     _target_: modules.gsam2.sam2.memory_encoder.MemoryEncoder
+     out_dim: 64
+     position_encoding:
+       _target_: modules.gsam2.sam2.position_encoding.PositionEmbeddingSine
+       num_pos_feats: 64
+       normalize: true
+       scale: null
+       temperature: 10000
+     mask_downsampler:
+       _target_: modules.gsam2.sam2.memory_encoder.MaskDownSampler
+       kernel_size: 3
+       stride: 2
+       padding: 1
+     fuser:
+       _target_: modules.gsam2.sam2.memory_encoder.Fuser
+       layer:
+         _target_: modules.gsam2.sam2.memory_encoder.CXBlock
+         dim: 256
+         kernel_size: 7
+         padding: 3
+         layer_scale_init_value: 1e-6
+         use_dwconv: True # depth-wise convs
+       num_layers: 2
+   sam_mask_decoder_extra_args:
+     dynamic_multimask_via_stability: true
+     dynamic_multimask_stability_delta: 0.05
+     dynamic_multimask_stability_thresh: 0.98
+   num_maskmem: 7
+   image_size: 1024
+   # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+   sigmoid_scale_for_mem_enc: 20.0
+   sigmoid_bias_for_mem_enc: -10.0
+   use_mask_input_as_output_without_sam: true
+   # Memory
+   directly_add_no_mem_embed: true
+   no_obj_embed_spatial: true
+   # use high-resolution feature map in the SAM mask decoder
+   use_high_res_features_in_sam: true
+   # output 3 masks on the first click on initial conditioning frames
+   multimask_output_in_sam: true
+   # SAM heads
+   iou_prediction_use_sigmoid: True
+   # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+   use_obj_ptrs_in_encoder: true
+   add_tpos_enc_to_obj_ptrs: true
+   proj_tpos_enc_in_obj_ptrs: true
+   use_signed_tpos_enc_to_obj_ptrs: true
+   only_obj_ptrs_in_the_past_for_eval: true
+   # object occlusion prediction
+   pred_obj_scores: true
+   pred_obj_scores_mlp: true
+   fixed_no_obj_ptr: true
+   # multimask tracking settings
+   multimask_output_for_tracking: true
+   use_multimask_token_for_obj_ptr: true
+   multimask_min_pt_num: 0
+   multimask_max_pt_num: 1
+   use_mlp_for_obj_ptr_proj: true
+   # Compilation flag
+   compile_image_encoder: False
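
Note: the _target_ keys mark this as a Hydra-style instantiation config: each node names a class plus its constructor arguments, nested recursively (SAM2Base wrapping a Hiera trunk, FpnNeck, memory attention and encoder, and so on). A minimal sketch of building the model from the file; hydra.utils.instantiate is the standard API for such configs, though whether this repo invokes it exactly this way is an assumption:

from omegaconf import OmegaConf
from hydra.utils import instantiate

# Recursively constructs SAM2Base and every nested _target_ object.
cfg = OmegaConf.load("gsam2_image/gsam2_image.yaml")
sam2 = instantiate(cfg.model)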
gsam2_video/gsam2_video.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2b3b1bc43ac04d206b52dc70e9032dc8ed55982d47c8ff5619234c9f349e6d6b
+ size 898072910
gsam2_video/gsam2_video.yaml ADDED
@@ -0,0 +1,125 @@
+ model:
+   _target_: modules.gsam2.sam2_video_predictor.SAM2VideoPredictor
+   image_encoder:
+     _target_: modules.gsam2.sam2.image_encoder.ImageEncoder
+     scalp: 1
+     trunk:
+       _target_: modules.gsam2.sam2.hieradet.Hiera
+       embed_dim: 144
+       num_heads: 2
+       stages: [2, 6, 36, 4]
+       global_att_blocks: [23, 33, 43]
+       window_pos_embed_bkg_spatial_size: [7, 7]
+       window_spec: [8, 4, 16, 8]
+     neck:
+       _target_: modules.gsam2.sam2.image_encoder.FpnNeck
+       position_encoding:
+         _target_: modules.gsam2.sam2.position_encoding.PositionEmbeddingSine
+         num_pos_feats: 256
+         normalize: true
+         scale: null
+         temperature: 10000
+       d_model: 256
+       backbone_channel_list: [1152, 576, 288, 144]
+       fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
+       fpn_interp_model: nearest
+
+   memory_attention:
+     _target_: modules.gsam2.sam2.memory_attention.MemoryAttention
+     d_model: 256
+     pos_enc_at_input: true
+     layer:
+       _target_: modules.gsam2.sam2.memory_attention.MemoryAttentionLayer
+       activation: relu
+       dim_feedforward: 2048
+       dropout: 0.1
+       pos_enc_at_attn: false
+       self_attention:
+         _target_: modules.gsam2.sam2.transformer.RoPEAttention
+         rope_theta: 10000.0
+         feat_sizes: [32, 32]
+         embedding_dim: 256
+         num_heads: 1
+         downsample_rate: 1
+         dropout: 0.1
+       d_model: 256
+       pos_enc_at_cross_attn_keys: true
+       pos_enc_at_cross_attn_queries: false
+       cross_attention:
+         _target_: modules.gsam2.sam2.transformer.RoPEAttention
+         rope_theta: 10000.0
+         feat_sizes: [32, 32]
+         rope_k_repeat: True
+         embedding_dim: 256
+         num_heads: 1
+         downsample_rate: 1
+         dropout: 0.1
+         kv_in_dim: 64
+     num_layers: 4
+
+   memory_encoder:
+     _target_: modules.gsam2.sam2.memory_encoder.MemoryEncoder
+     out_dim: 64
+     position_encoding:
+       _target_: modules.gsam2.sam2.position_encoding.PositionEmbeddingSine
+       num_pos_feats: 64
+       normalize: true
+       scale: null
+       temperature: 10000
+     mask_downsampler:
+       _target_: modules.gsam2.sam2.memory_encoder.MaskDownSampler
+       kernel_size: 3
+       stride: 2
+       padding: 1
+     fuser:
+       _target_: modules.gsam2.sam2.memory_encoder.Fuser
+       layer:
+         _target_: modules.gsam2.sam2.memory_encoder.CXBlock
+         dim: 256
+         kernel_size: 7
+         padding: 3
+         layer_scale_init_value: 1e-6
+         use_dwconv: True # depth-wise convs
+       num_layers: 2
+
+   sam_mask_decoder_extra_args:
+     dynamic_multimask_via_stability: true
+     dynamic_multimask_stability_delta: 0.05
+     dynamic_multimask_stability_thresh: 0.98
+   # binarize the sigmoid mask logits on interacted frames with clicks in the memory encoder so that the encoded masks are exactly as what users see from clicking
+   binarize_mask_from_pts_for_mem_enc: true
+   # fill small holes in the low-res masks up to `fill_hole_area` (before resizing them to the original video resolution)
+   fill_hole_area: 8
+   num_maskmem: 7
+   image_size: 1024
+   # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+   sigmoid_scale_for_mem_enc: 20.0
+   sigmoid_bias_for_mem_enc: -10.0
+   use_mask_input_as_output_without_sam: true
+   # Memory
+   directly_add_no_mem_embed: true
+   no_obj_embed_spatial: true
+   # use high-resolution feature map in the SAM mask decoder
+   use_high_res_features_in_sam: true
+   # output 3 masks on the first click on initial conditioning frames
+   multimask_output_in_sam: true
+   # SAM heads
+   iou_prediction_use_sigmoid: True
+   # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+   use_obj_ptrs_in_encoder: true
+   add_tpos_enc_to_obj_ptrs: true
+   proj_tpos_enc_in_obj_ptrs: true
+   use_signed_tpos_enc_to_obj_ptrs: true
+   only_obj_ptrs_in_the_past_for_eval: true
+   # object occlusion prediction
+   pred_obj_scores: true
+   pred_obj_scores_mlp: true
+   fixed_no_obj_ptr: true
+   # multimask tracking settings
+   multimask_output_for_tracking: true
+   use_multimask_token_for_obj_ptr: true
+   multimask_min_pt_num: 0
+   multimask_max_pt_num: 1
+   use_mlp_for_obj_ptr_proj: true
+   # Compilation flag
+   compile_image_encoder: False
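
Note: gsam2_video.pt has the same oid and size as gsam2_image.pt, so both pointers resolve to a single 898072910-byte blob. Loading it into the instantiated predictor might look like this sketch, under the stated assumption that the .pt file stores a plain state dict:

import torch
from omegaconf import OmegaConf
from hydra.utils import instantiate

cfg = OmegaConf.load("gsam2_video/gsam2_video.yaml")
predictor = instantiate(cfg.model)
# Assumption: the checkpoint is a bare state dict, not a wrapper object.
state = torch.load("gsam2_video/gsam2_video.pt", map_location="cpu")
predictor.load_state_dict(state)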
llama/config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0f7941a3d70fc216b82de7df97863937dcd33e644416665b89dbdc67b1daa66a
+ size 928
llama/generation_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8d4beaeab19fc8c2d4b0725864a97f73bf787b1bd4bec222cc9e1e6a8030ca9c
+ size 230
llama/model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:477ff93289f45ff953aa9f7ce6331d5fce4ee466f53865301333ea0ccece72dd
+ size 4976698592
llama/model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8899cf9adf52f3fa093a6469f4efaecf66d9b62317eddde424ae8358d704a05b
+ size 4999802616
llama/model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fe29f6c7b95a76fafb3f9dcd614f036f41f44c19ff2e4cf58a9e2ccf442e10b1
+ size 4915916080
llama/model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:013bc9e3d5d06232d8a358ad3cd323b732255d31e2debe3bf0c6325b88110ae0
+ size 1168138808
llama/model.safetensors.index.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:146776fce3f6db1103aa6f249e65ee5544c5923ce6f971b092eee79aa6e5d37b
+ size 23950
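
Note: the four safetensors shards plus model.safetensors.index.json are the standard Hugging Face sharded-checkpoint layout; the shard sizes sum to roughly 16.06 GB, consistent with a model of about 8B parameters in 16-bit precision. With the LFS blobs pulled, transformers resolves the shards through the index automatically; a sketch:

from transformers import AutoModelForCausalLM

# The index maps each tensor name to its shard, so all four
# model-0000x-of-00004.safetensors files are loaded transparently.
model = AutoModelForCausalLM.from_pretrained("llama", torch_dtype="auto")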
sensevoice/am.mvn ADDED
@@ -0,0 +1,8 @@
+ <Nnet>
+ <Splice> 400 400
+ [ 0 ]
+ <AddShift> 400 400
+ <LearnRateCoef> 0 [ -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 
-13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 ]
+ <Rescale> 400 400
+ <LearnRateCoef> 0 [ 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 
0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 ]
+ </Nnet>
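
Note: am.mvn is a Kaldi-style nnet snippet that applies global CMVN to 400-dim features, matching the VAD frontend below (n_mels: 80 × lfr_m: 5 = 400): <AddShift> adds the first 400-value vector (negated means) and <Rescale> multiplies by the second (roughly inverse standard deviations). The equivalent arithmetic, with shift and scale standing in for the two <LearnRateCoef> vectors above:

import numpy as np

def apply_cmvn(feats, shift, scale):
    # feats: (num_frames, 400) LFR features; shift and scale: (400,)
    # vectors from the <AddShift> and <Rescale> blocks of am.mvn.
    return (feats + shift) * scale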
sensevoice/chn_jpn_yue_eng_ko_spectok.bpe.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aa87f86064c3730d799ddf7af3c04659151102cba548bce325cf06ba4da4e6a8
+ size 377341
sensevoice/sensevoice.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f6280b847a0ec99377af351999a2d2223d005dd3161d4005328db35820d6f91d
+ size 938091356
sensevoice/sensevoice.yaml ADDED
@@ -0,0 +1,200 @@
+ model:
+   _target_: modules.sensevoice.model.SenseVoice
+   model:
+     _target_: modules.sensevoice.sensevoicesmall.SenseVoiceSmall
+     specaug:
+       _target_: modules.sensevoice.specaug.SpecAugLFR
+       apply_freq_mask: true
+       apply_time_mask: true
+       apply_time_warp: false
+       freq_mask_width_range: [0, 30]
+       lfr_rate: 6
+       num_freq_mask: 1
+       num_time_mask: 1
+       time_mask_width_range: [0, 12]
+       time_warp_mode: bicubic
+       time_warp_window: 5
+     encoder:
+       _target_: modules.sensevoice.sensevoicesmall.SenseVoiceEncoderSmall
+       attention_dropout_rate: 0.1
+       attention_heads: 4
+       dropout_rate: 0.1
+       kernel_size: 11
+       linear_units: 2048
+       normalize_before: true
+       num_blocks: 50
+       output_size: 512
+       sanm_shfit: 0
+       tp_blocks: 20
+       input_size: 560
+     tokenizer:
+       _target_: modules.sensevoice.tokenizer.SentencepiecesTokenizer
+       bpemodel: /storage-root/datasets/zhaojingtong/checkpoints/sensevoice/chn_jpn_yue_eng_ko_spectok.bpe.model
+       unk_symbol: <unk>
+       split_with_space: true
+     frontend:
+       _target_: modules.sensevoice.frontend.WavFrontend
+       fs: 16000
+       window: hamming
+       n_mels: 80
+       frame_length: 25
+       frame_shift: 10
+       lfr_m: 7
+       lfr_n: 6
+     length_normalized_loss: true
+     input_size: 560
+     vocab_size: 25055
+     sos: 1
+     eos: 2
+     ignore_id: -1
+   vad_model:
+     _target_: modules.sensevoice.vad.FsmnVADStreaming
+     encoder:
+       _target_: modules.sensevoice.vad.FSMN
+       input_dim: 400
+       input_affine_dim: 140
+       fsmn_layers: 4
+       linear_dim: 250
+       proj_dim: 128
+       lorder: 20
+       rorder: 0
+       lstride: 1
+       rstride: 0
+       output_affine_dim: 140
+       output_dim: 248
+     frontend:
+       _target_: modules.sensevoice.frontend.WavFrontendOnline
+       fs: 16000
+       window: hamming
+       n_mels: 80
+       frame_length: 25
+       frame_shift: 10
+       dither: 0.0
+       lfr_m: 5
+       lfr_n: 1
+       cmvn_file: /storage-root/datasets/zhaojingtong/checkpoints/sensevoice/am.mvn
+     sample_rate: 16000
+     detect_mode: 1
+     snr_mode: 0
+     max_end_silence_time: 800
+     max_start_silence_time: 3000
+     do_start_point_detection: True
+     do_end_point_detection: True
+     window_size_ms: 200
+     sil_to_speech_time_thres: 150
+     speech_to_sil_time_thres: 150
+     speech_2_noise_ratio: 1.0
+     do_extend: 1
+     lookback_time_start_point: 200
+     lookahead_time_end_point: 100
+     max_single_segment_time: 60000
+     snr_thres: -100.0
+     noise_frame_num_used_for_snr: 100
+     decibel_thres: -100.0
+     speech_noise_thres: 0.6
+     fe_prior_thres: 0.0001
+     silence_pdf_num: 1
+     sil_pdf_ids: [0]
+     speech_noise_thresh_low: -0.1
+     speech_noise_thresh_high: 0.3
+     output_frame_probs: False
+     frame_in_ms: 10
+     frame_length_ms: 25
+     tokenizer: None
+     vocab_size: -1
+     input_size: 400
+   kwargs:
+     specaug:
+       apply_freq_mask: true
+       apply_time_mask: true
+       apply_time_warp: false
+       freq_mask_width_range: [0, 30]
+       lfr_rate: 6
+       num_freq_mask: 1
+       num_time_mask: 1
+       time_mask_width_range: [0, 12]
+       time_warp_mode: bicubic
+       time_warp_window: 5
+     encoder:
+       attention_dropout_rate: 0.1
+       attention_heads: 4
+       dropout_rate: 0.1
+       kernel_size: 11
+       linear_units: 2048
+       normalize_before: true
+       num_blocks: 50
+       output_size: 512
+       sanm_shfit: 0
+       tp_blocks: 20
+       input_size: 560
+     tokenizer:
+       bpemodel: /storage-root/datasets/zhaojingtong/checkpoints/sensevoice/chn_jpn_yue_eng_ko_spectok.bpe.model
+       unk_symbol: <unk>
+       split_with_space: true
+     frontend:
+       fs: 16000
+       window: hamming
+       n_mels: 80
+       frame_length: 25
+       frame_shift: 10
+       lfr_m: 7
+       lfr_n: 6
+     length_normalized_loss: true
+     input_size: 560
+     vocab_size: 25055
+     sos: 1
+     eos: 2
+     ignore_id: -1
+   vad_kwargs:
+     encoder:
+       input_dim: 400
+       input_affine_dim: 140
+       fsmn_layers: 4
+       linear_dim: 250
+       proj_dim: 128
+       lorder: 20
+       rorder: 0
+       lstride: 1
+       rstride: 0
+       output_affine_dim: 140
+       output_dim: 248
+     frontend:
+       fs: 16000
+       window: hamming
+       n_mels: 80
+       frame_length: 25
+       frame_shift: 10
+       dither: 0.0
+       lfr_m: 5
+       lfr_n: 1
+       cmvn_file: /storage-root/datasets/zhaojingtong/checkpoints/sensevoice/am.mvn
+     sample_rate: 16000
+     detect_mode: 1
+     snr_mode: 0
+     max_end_silence_time: 800
+     max_start_silence_time: 3000
+     do_start_point_detection: True
+     do_end_point_detection: True
+     window_size_ms: 200
+     sil_to_speech_time_thres: 150
+     speech_to_sil_time_thres: 150
+     speech_2_noise_ratio: 1.0
+     do_extend: 1
+     lookback_time_start_point: 200
+     lookahead_time_end_point: 100
+     max_single_segment_time: 60000
+     snr_thres: -100.0
+     noise_frame_num_used_for_snr: 100
+     decibel_thres: -100.0
+     speech_noise_thres: 0.6
+     fe_prior_thres: 0.0001
+     silence_pdf_num: 1
+     sil_pdf_ids: [0]
+     speech_noise_thresh_low: -0.1
+     speech_noise_thresh_high: 0.3
+     output_frame_probs: False
+     frame_in_ms: 10
+     frame_length_ms: 25
+     tokenizer: None
+     vocab_size: -1
+     input_size: 400
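
Note: the frontend settings explain input_size: 560. Low-frame-rate (LFR) stacking with lfr_m: 7 concatenates 7 consecutive 80-dim mel frames (80 × 7 = 560) and advances lfr_n: 6 frames per step. An illustrative sketch of that stacking (not the actual modules.sensevoice.frontend code, whose edge handling may differ):

import numpy as np

def lfr_stack(feats, lfr_m=7, lfr_n=6):
    # feats: (T, 80) mel frames -> (ceil(T / lfr_n), 80 * lfr_m)
    out = []
    for i in range(0, len(feats), lfr_n):
        window = feats[i:i + lfr_m]
        if len(window) < lfr_m:  # pad the tail by repeating the last frame
            pad = np.repeat(window[-1:], lfr_m - len(window), axis=0)
            window = np.vstack([window, pad])
        out.append(window.reshape(-1))
    return np.stack(out)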
siglip/siglip.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3afbb189b33a42a2ef804a388fa88aea293741949a6cdd61035eeb094139bff1
+ size 1713053358
siglip/siglip.yaml ADDED
@@ -0,0 +1,29 @@
+ model:
+   _target_: modules.siglip.ImageEncoder
+   embeddings:
+     _target_: modules.siglip.SiglipVisionEmbeddings
+     hidden_size: 1152
+     image_size: 384
+     patch_size: 14
+     num_channels: 3
+   encoder:
+     _target_: modules.siglip.SiglipEncoder
+     hidden_size: 1152
+     num_attention_heads: 16
+     attention_dropout: 0.0
+     hidden_act: gelu_pytorch_tanh
+     intermediate_size: 4304
+     layer_norm_eps: 1e-6
+     num_hidden_layers: 27
+   head:
+     _target_: modules.siglip.SiglipMultiheadAttentionPoolingHead
+     mlp:
+       _target_: modules.siglip.SiglipMLP
+       hidden_act: gelu_pytorch_tanh
+       hidden_size: 1152
+       intermediate_size: 4304
+     hidden_size: 1152
+     num_attention_heads: 16
+     layer_norm_eps: 1e-6
+   hidden_size: 1152
+   layer_norm_eps: 1e-6
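
Note: assuming valid (unpadded) patchification as in the reference SigLIP vision embeddings, these settings fix the sequence length the 27-layer encoder sees: a 14×14 patch convolution with stride 14 over a 384×384 image gives (384 - 14) // 14 + 1 = 27 patches per side, i.e. 27 × 27 = 729 tokens of width 1152. A quick check of that arithmetic:

image_size, patch_size = 384, 14
side = (image_size - patch_size) // patch_size + 1  # valid conv, stride = patch_size
print(side, side * side)  # 27 729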
sv_encoder/sv_encoder.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:75759b2e87f6e8974f86d1d275c6a9d1c96f2a300b32e2c88ee5180331ff558a
+ size 884939345
sv_encoder/sv_encoder.yaml ADDED
@@ -0,0 +1,33 @@
+ model:
+   _target_: modules.gaudio.sv_encoder.SenseVoiceSmall
+   specaug:
+     _target_: modules.gaudio.sv_encoder.SpecAugLFR
+     apply_freq_mask: true
+     apply_time_mask: true
+     apply_time_warp: false
+     freq_mask_width_range: [0, 30]
+     lfr_rate: 6
+     num_freq_mask: 1
+     num_time_mask: 1
+     time_mask_width_range: [0, 12]
+     time_warp_mode: bicubic
+     time_warp_window: 5
+   encoder:
+     _target_: modules.gaudio.sv_encoder.SenseVoiceEncoderSmall
+     attention_dropout_rate: 0.1
+     attention_heads: 4
+     dropout_rate: 0.1
+     kernel_size: 11
+     linear_units: 2048
+     normalize_before: true
+     num_blocks: 50
+     output_size: 512
+     sanm_shfit: 0
+     tp_blocks: 20
+     input_size: 560
+   length_normalized_loss: true
+   input_size: 560
+   vocab_size: 25055
+   sos: 1
+   eos: 2
+   ignore_id: -1
+ ignore_id: -1
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c73565d76ce52078f48b8cd9a5ab59615c5519704c2fa6f52c5f6309915a70e7
+ size 480
tokenizer/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:200a2a14d4927029e80613d8f4f770359e5910d931906c5d1d195dcdae9c6828
+ size 17209899
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:95faea2e162d6f8336ccaebaff5e94b779e279673c8bb13a7f0d427162fb7ef7
+ size 55400