kingabzpro committed on
Commit
aa94c13
1 Parent(s): 3bfb595

add LM preprocessing

Browse files
added_tokens.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"<s>": 30, "</s>": 31}
alphabet.json DELETED
@@ -1 +0,0 @@
1
- {"labels": [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "\u00e9", "\u2047", "", "<s>", "</s>"], "is_bpe": false}
 
 
config.json CHANGED
@@ -8,7 +8,7 @@
8
  "architectures": [
9
  "Wav2Vec2ForCTC"
10
  ],
11
- "attention_dropout": 0.1,
12
  "bos_token_id": 1,
13
  "classifier_proj_size": 256,
14
  "codevector_dim": 768,
@@ -42,30 +42,30 @@
42
  2
43
  ],
44
  "ctc_loss_reduction": "mean",
45
- "ctc_zero_infinity": false,
46
  "diversity_loss_weight": 0.1,
47
  "do_stable_layer_norm": true,
48
  "eos_token_id": 2,
49
  "feat_extract_activation": "gelu",
50
  "feat_extract_dropout": 0.0,
51
  "feat_extract_norm": "layer",
52
- "feat_proj_dropout": 0.0,
53
  "feat_quantizer_dropout": 0.0,
54
  "final_dropout": 0.0,
55
  "gradient_checkpointing": false,
56
  "hidden_act": "gelu",
57
- "hidden_dropout": 0.1,
58
  "hidden_size": 1024,
59
  "initializer_range": 0.02,
60
  "intermediate_size": 4096,
61
  "layer_norm_eps": 1e-05,
62
- "layerdrop": 0.1,
63
  "mask_feature_length": 10,
64
  "mask_feature_min_masks": 0,
65
  "mask_feature_prob": 0.0,
66
  "mask_time_length": 10,
67
  "mask_time_min_masks": 2,
68
- "mask_time_prob": 0.05,
69
  "model_type": "wav2vec2",
70
  "num_adapter_layers": 3,
71
  "num_attention_heads": 16,
@@ -77,7 +77,7 @@
77
  "num_hidden_layers": 24,
78
  "num_negatives": 100,
79
  "output_hidden_size": 1024,
80
- "pad_token_id": 35,
81
  "proj_codevector_dim": 768,
82
  "tdnn_dilation": [
83
  1,
@@ -103,6 +103,6 @@
103
  "torch_dtype": "float32",
104
  "transformers_version": "4.17.0.dev0",
105
  "use_weighted_layer_sum": false,
106
- "vocab_size": 38,
107
  "xvector_output_dim": 512
108
  }
 
8
  "architectures": [
9
  "Wav2Vec2ForCTC"
10
  ],
11
+ "attention_dropout": 0.094,
12
  "bos_token_id": 1,
13
  "classifier_proj_size": 256,
14
  "codevector_dim": 768,
 
42
  2
43
  ],
44
  "ctc_loss_reduction": "mean",
45
+ "ctc_zero_infinity": true,
46
  "diversity_loss_weight": 0.1,
47
  "do_stable_layer_norm": true,
48
  "eos_token_id": 2,
49
  "feat_extract_activation": "gelu",
50
  "feat_extract_dropout": 0.0,
51
  "feat_extract_norm": "layer",
52
+ "feat_proj_dropout": 0.4,
53
  "feat_quantizer_dropout": 0.0,
54
  "final_dropout": 0.0,
55
  "gradient_checkpointing": false,
56
  "hidden_act": "gelu",
57
+ "hidden_dropout": 0.047,
58
  "hidden_size": 1024,
59
  "initializer_range": 0.02,
60
  "intermediate_size": 4096,
61
  "layer_norm_eps": 1e-05,
62
+ "layerdrop": 0.041,
63
  "mask_feature_length": 10,
64
  "mask_feature_min_masks": 0,
65
  "mask_feature_prob": 0.0,
66
  "mask_time_length": 10,
67
  "mask_time_min_masks": 2,
68
+ "mask_time_prob": 0.4,
69
  "model_type": "wav2vec2",
70
  "num_adapter_layers": 3,
71
  "num_attention_heads": 16,
 
77
  "num_hidden_layers": 24,
78
  "num_negatives": 100,
79
  "output_hidden_size": 1024,
80
+ "pad_token_id": 29,
81
  "proj_codevector_dim": 768,
82
  "tdnn_dilation": [
83
  1,
 
103
  "torch_dtype": "float32",
104
  "transformers_version": "4.17.0.dev0",
105
  "use_weighted_layer_sum": false,
106
+ "vocab_size": 32,
107
  "xvector_output_dim": 512
108
  }
special_tokens_map.json CHANGED
@@ -1 +1 @@
1
- {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
tokenizer_config.json CHANGED
@@ -1 +1 @@
1
- {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>", "do_lower_case": false, "word_delimiter_token": "|", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
 
1
+ {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": "wav2vec2-xls-r-300m-indonesian/special_tokens_map.json", "tokenizer_file": null, "name_or_path": "wav2vec2-xls-r-300m-indonesian", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
vocab.json CHANGED
@@ -1 +1 @@
1
- {"(": 1, "[": 2, "a": 3, "b": 4, "c": 5, "d": 6, "e": 7, "f": 8, "g": 9, "h": 10, "i": 11, "j": 12, "k": 13, "l": 14, "m": 15, "n": 16, "o": 17, "p": 18, "q": 19, "r": 20, "s": 21, "t": 22, "u": 23, "v": 24, "w": 25, "x": 26, "y": 27, "z": 28, "á": 29, "é": 30, "ł": 31, "ń": 32, "–": 33, "|": 0, "<unk>": 34, "<pad>": 35, "<s>": 36, "</s>": 37}
 
1
+ {"a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6, "g": 7, "h": 8, "i": 9, "j": 10, "k": 11, "l": 12, "m": 13, "n": 14, "o": 15, "p": 16, "q": 17, "r": 18, "s": 19, "t": 20, "u": 21, "v": 22, "w": 23, "x": 24, "y": 25, "z": 26, "é": 27, "|": 0, "[UNK]": 28, "[PAD]": 29}