Aananda-giri committed
Commit 9a7d186 · verified · 1 Parent(s): ea7d50d

Upload tokenizer

Files changed (3)
  1. README.md +3 -0
  2. tokenizer.json +0 -0
  3. tokenizer_config.json +14 -13
README.md CHANGED
@@ -1,3 +1,6 @@
+---
+{}
+---
 # BPE Tokenizer for Nepali LLM
 
 This repository contains a Byte Pair Encoding (BPE) tokenizer trained using the Hugging Face `transformers` package on the Nepali LLM dataset. The tokenizer has been optimized for handling Nepali text and is intended for use in language modeling and other natural language processing tasks.
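
The README above describes a BPE tokenizer for Nepali built on the Hugging Face `transformers` stack. A minimal usage sketch, under assumptions not stated in the commit (it loads from a local clone of this repo rather than a Hub repo id, and the sample sentence is arbitrary):

```python
# Minimal sketch: load the uploaded tokenizer files and round-trip a Nepali string.
# Assumption: tokenizer.json and tokenizer_config.json from this repo are in the
# current directory; replace "." with the Hub repo id to load remotely instead.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(".")  # resolves to PreTrainedTokenizerFast per tokenizer_config.json

text = "नमस्ते संसार"  # arbitrary Nepali sample: "hello world"
ids = tokenizer.encode(text)
print(ids)                    # BPE token ids
print(tokenizer.decode(ids))  # should reproduce the input text
```
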
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,47 +1,47 @@
 {
   "added_tokens_decoder": {
-    "0": {
-      "content": "<|endoftext|>",
+    "50000": {
+      "content": "<|begin_of_text|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "1": {
-      "content": "<|unk|>",
+    "50001": {
+      "content": "<|end_of_text|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "2": {
-      "content": "<|sep|>",
+    "50002": {
+      "content": "<|start_header_id|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "3": {
-      "content": "<|pad|>",
+    "50003": {
+      "content": "<|end_header_id|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "4": {
-      "content": "<|mask|>",
+    "50004": {
+      "content": "<|eot_id|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "5": {
-      "content": "<|newline|>",
+    "50005": {
+      "content": "\n\n",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -49,7 +49,8 @@
       "special": true
     }
   },
-  "clean_up_tokenization_spaces": true,
+  "clean_up_tokenization_spaces": false,
+  "extra_special_tokens": {},
   "model_max_length": 1000000000000000019884624838656,
   "tokenizer_class": "PreTrainedTokenizerFast"
 }
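
The updated tokenizer_config.json replaces the old <|endoftext|>/<|unk|>/<|sep|>/<|pad|>/<|mask|>/<|newline|> entries with new special tokens at ids 50000-50005 (<|begin_of_text|>, <|end_of_text|>, <|start_header_id|>, <|end_header_id|>, <|eot_id|>, and a literal "\n\n") and turns off clean_up_tokenization_spaces. A hedged sketch of checking the new table after loading, assuming a local checkout with the updated files and a recent `transformers` version that exposes `added_tokens_decoder`:

```python
# Sketch: inspect the special tokens registered by the updated tokenizer_config.json.
# Assumption: run from a local checkout containing the new tokenizer files.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")

# Expected entries per this commit: 50000 <|begin_of_text|>, 50001 <|end_of_text|>,
# 50002 <|start_header_id|>, 50003 <|end_header_id|>, 50004 <|eot_id|>, 50005 "\n\n".
for token_id, added in sorted(tok.added_tokens_decoder.items()):
    print(token_id, repr(added.content))

print(tok.convert_tokens_to_ids("<|begin_of_text|>"))  # expected: 50000

# clean_up_tokenization_spaces is now false, so decode() leaves spacing exactly as
# the BPE model produces it rather than applying the legacy cleanup pass.
print(tok.decode(tok.encode("<|begin_of_text|>नमस्ते<|eot_id|>")))
```
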