tencent
/

Tencent-Hunyuan-Large

woodchen7 committed on Oct 23, 2024

Commit

5fe882b

verified ·

1 Parent(s): 9495cdc

Upload test.py with huggingface_hub

Files changed (1) hide show

test.py ADDED Viewed

+from tokenizers import ByteLevelBPETokenizer
+from transformers import AutoTokenizer
+# Step 1: Initialize ByteLevelBPETokenizer
+#tokenizer = ByteLevelBPETokenizer(
+#    "vocab.json",
+#    "merges.txt"
+#)
+# Step 2: Save the tokenizer configuration
+#tokenizer.save_model("auto_model")
+# Step 3: Load the tokenizer using AutoTokenizer
+auto_tokenizer = AutoTokenizer.from_pretrained("./", use_fast=False, trust_remote_code=True)
+# Test the tokenizer
+text = "Hello, world!"
+encoded = auto_tokenizer.encode(text)
+decoded = auto_tokenizer.decode(encoded)
+print("Encoded:", encoded)
+print("Decoded:", decoded)
+messages = [
+    {"role": "system", "content": "You are a helpful assistant."},
+    {"role": "user", "content": "Hello, how are you?"},
+    {"role": "assistant", "content": "I'm good, thank you! How can I help you today?"},
+    {"role": "user", "content": "Nothing"},
+]
+print('messages:', messages)
+ids = auto_tokenizer.apply_chat_template(messages)
+print(f"input_ids:\t{ids}")
+text = auto_tokenizer.decode(ids)
+print(f"input_text:\t[{text}]")