arxyzan committed on
Commit
72c619d
1 Parent(s): 8d057f4

Hezar: Upload tokenizer and config

Browse files
Files changed (1) hide show
  1. preprocessor/tokenizer_config.yaml +140 -0
preprocessor/tokenizer_config.yaml ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: whisper_bpe_tokenizer
2
+ config_type: preprocessor
3
+ pretrained_path: hezarai/whisper-small
4
+ max_length: 512
5
+ truncation_strategy: longest_first
6
+ truncation_direction: right
7
+ stride: 0
8
+ padding_strategy: longest
9
+ padding_direction: right
10
+ pad_to_multiple_of: 0
11
+ pad_token_id: 0
12
+ pad_token: <pad>
13
+ pad_token_type_id: 0
14
+ unk_token: <|endoftext|>
15
+ special_tokens:
16
+ - <|endoftext|>
17
+ - <|endoftext|>
18
+ - <|startoftranscript|>
19
+ - <|en|>
20
+ - <|zh|>
21
+ - <|de|>
22
+ - <|es|>
23
+ - <|ru|>
24
+ - <|ko|>
25
+ - <|fr|>
26
+ - <|ja|>
27
+ - <|pt|>
28
+ - <|tr|>
29
+ - <|pl|>
30
+ - <|ca|>
31
+ - <|nl|>
32
+ - <|ar|>
33
+ - <|sv|>
34
+ - <|it|>
35
+ - <|id|>
36
+ - <|hi|>
37
+ - <|fi|>
38
+ - <|vi|>
39
+ - <|he|>
40
+ - <|uk|>
41
+ - <|el|>
42
+ - <|ms|>
43
+ - <|cs|>
44
+ - <|ro|>
45
+ - <|da|>
46
+ - <|hu|>
47
+ - <|ta|>
48
+ - <|no|>
49
+ - <|th|>
50
+ - <|ur|>
51
+ - <|hr|>
52
+ - <|bg|>
53
+ - <|lt|>
54
+ - <|la|>
55
+ - <|mi|>
56
+ - <|ml|>
57
+ - <|cy|>
58
+ - <|sk|>
59
+ - <|te|>
60
+ - <|fa|>
61
+ - <|lv|>
62
+ - <|bn|>
63
+ - <|sr|>
64
+ - <|az|>
65
+ - <|sl|>
66
+ - <|kn|>
67
+ - <|et|>
68
+ - <|mk|>
69
+ - <|br|>
70
+ - <|eu|>
71
+ - <|is|>
72
+ - <|hy|>
73
+ - <|ne|>
74
+ - <|mn|>
75
+ - <|bs|>
76
+ - <|kk|>
77
+ - <|sq|>
78
+ - <|sw|>
79
+ - <|gl|>
80
+ - <|mr|>
81
+ - <|pa|>
82
+ - <|si|>
83
+ - <|km|>
84
+ - <|sn|>
85
+ - <|yo|>
86
+ - <|so|>
87
+ - <|af|>
88
+ - <|oc|>
89
+ - <|ka|>
90
+ - <|be|>
91
+ - <|tg|>
92
+ - <|sd|>
93
+ - <|gu|>
94
+ - <|am|>
95
+ - <|yi|>
96
+ - <|lo|>
97
+ - <|uz|>
98
+ - <|fo|>
99
+ - <|ht|>
100
+ - <|ps|>
101
+ - <|tk|>
102
+ - <|nn|>
103
+ - <|mt|>
104
+ - <|sa|>
105
+ - <|lb|>
106
+ - <|my|>
107
+ - <|bo|>
108
+ - <|tl|>
109
+ - <|mg|>
110
+ - <|as|>
111
+ - <|tt|>
112
+ - <|haw|>
113
+ - <|ln|>
114
+ - <|ha|>
115
+ - <|ba|>
116
+ - <|jw|>
117
+ - <|su|>
118
+ - <|translate|>
119
+ - <|transcribe|>
120
+ - <|startoflm|>
121
+ - <|startofprev|>
122
+ - <|nocaptions|>
123
+ - <|notimestamps|>
124
+ continuing_subword_prefix: ''
125
+ end_of_word_suffix: ''
126
+ fuse_unk: false
127
+ vocab_size: 50364
128
+ min_frequency: 2
129
+ limit_alphabet: 1000
130
+ initial_alphabet: []
131
+ show_progress: true
132
+ unk_token_id: 50257
133
+ bos_token: <|startoftranscript|>
134
+ bos_token_id: 50257
135
+ eos_token: <|endoftext|>
136
+ eos_token_id: 50257
137
+ add_prefix_space: false
138
+ add_bos_token: false
139
+ model_max_length: 1024
140
+ predict_timestamps: false