Aananda-giri committed
Commit 9a7d186 · verified · 1 Parent(s): ea7d50d

Upload tokenizer

Files changed (3)
  1. README.md +3 -0
  2. tokenizer.json +0 -0
  3. tokenizer_config.json +14 -13
README.md CHANGED
@@ -1,3 +1,6 @@
+---
+{}
+---
 # BPE Tokenizer for Nepali LLM
 
 This repository contains a Byte Pair Encoding (BPE) tokenizer trained using the Hugging Face `transformers` package on the Nepali LLM dataset. The tokenizer has been optimized for handling Nepali text and is intended for use in language modeling and other natural language processing tasks.
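
The README above describes a BPE tokenizer for Nepali built on the Hugging Face `transformers` stack. A minimal usage sketch, under assumptions not stated in the commit (it loads from a local clone of this repo rather than a Hub repo id, and the sample sentence is arbitrary):

```python
# Minimal sketch: load the uploaded tokenizer files and round-trip a Nepali string.
# Assumption: tokenizer.json and tokenizer_config.json from this repo are in the
# current directory; replace "." with the Hub repo id to load remotely instead.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(".")  # resolves to PreTrainedTokenizerFast per tokenizer_config.json

text = "नमस्ते संसार"  # arbitrary Nepali sample: "hello world"
ids = tokenizer.encode(text)
print(ids)                    # BPE token ids
print(tokenizer.decode(ids))  # should reproduce the input text
```
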
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,47 +1,47 @@
 {
   "added_tokens_decoder": {
-    "0": {
-      "content": "<|endoftext|>",
+    "50000": {
+      "content": "<|begin_of_text|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "1": {
-      "content": "<|unk|>",
+    "50001": {
+      "content": "<|end_of_text|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "2": {
-      "content": "<|sep|>",
+    "50002": {
+      "content": "<|start_header_id|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "3": {
-      "content": "<|pad|>",
+    "50003": {
+      "content": "<|end_header_id|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "4": {
-      "content": "<|mask|>",
+    "50004": {
+      "content": "<|eot_id|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "5": {
-      "content": "<|newline|>",
+    "50005": {
+      "content": "\n\n",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -49,7 +49,8 @@
       "special": true
     }
   },
-  "clean_up_tokenization_spaces": true,
+  "clean_up_tokenization_spaces": false,
+  "extra_special_tokens": {},
   "model_max_length": 1000000000000000019884624838656,
   "tokenizer_class": "PreTrainedTokenizerFast"
 }
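
The updated tokenizer_config.json replaces the old <|endoftext|>/<|unk|>/<|sep|>/<|pad|>/<|mask|>/<|newline|> entries with new special tokens at ids 50000-50005 (<|begin_of_text|>, <|end_of_text|>, <|start_header_id|>, <|end_header_id|>, <|eot_id|>, and a literal "\n\n") and turns off clean_up_tokenization_spaces. A hedged sketch of checking the new table after loading, assuming a local checkout with the updated files and a recent `transformers` version that exposes `added_tokens_decoder`:

```python
# Sketch: inspect the special tokens registered by the updated tokenizer_config.json.
# Assumption: run from a local checkout containing the new tokenizer files.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")

# Expected entries per this commit: 50000 <|begin_of_text|>, 50001 <|end_of_text|>,
# 50002 <|start_header_id|>, 50003 <|end_header_id|>, 50004 <|eot_id|>, 50005 "\n\n".
for token_id, added in sorted(tok.added_tokens_decoder.items()):
    print(token_id, repr(added.content))

print(tok.convert_tokens_to_ids("<|begin_of_text|>"))  # expected: 50000

# clean_up_tokenization_spaces is now false, so decode() leaves spacing exactly as
# the BPE model produces it rather than applying the legacy cleanup pass.
print(tok.decode(tok.encode("<|begin_of_text|>नमस्ते<|eot_id|>")))
```
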