Update gptx_tokenizer.py
Browse files- gptx_tokenizer.py +9 -24
gptx_tokenizer.py
CHANGED
@@ -7,7 +7,7 @@ from pathlib import Path
|
|
7 |
from typing import Any, Dict, List, Mapping, Optional, Tuple, Union
|
8 |
|
9 |
import sentencepiece as spm
|
10 |
-
from huggingface_hub import hf_hub_download, list_repo_files
|
11 |
from transformers.tokenization_utils import PreTrainedTokenizer
|
12 |
from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE
|
13 |
|
@@ -62,29 +62,14 @@ class HFGPTXTokenizer(PreTrainedTokenizer):
|
|
62 |
f"<placeholder_tok_{i}>" for i in range(256)
|
63 |
]
|
64 |
|
65 |
-
def find_tokenizer_config(self, config_path: Path, repo_id: str = None) -> Path:
|
66 |
-
if
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
# Find the tokenizer config file
|
74 |
-
tokenizer_files = [f for f in repo_files if f.endswith('tokenizer_config.json')]
|
75 |
-
if not tokenizer_files:
|
76 |
-
raise FileNotFoundError(f"No tokenizer_config.json file found in repository {repo_id}")
|
77 |
-
|
78 |
-
# Use the first tokenizer_config.json file found
|
79 |
-
tokenizer_config_file = tokenizer_files[0]
|
80 |
-
print(f"Found tokenizer config file: {tokenizer_config_file}")
|
81 |
-
|
82 |
-
# Download the file
|
83 |
-
tokenizer_config_file_or_name = hf_hub_download(repo_id=repo_id, filename=tokenizer_config_file)
|
84 |
-
print(f"Downloaded tokenizer config file to: {tokenizer_config_file_or_name}")
|
85 |
-
return tokenizer_config_file_or_name
|
86 |
-
except Exception as e:
|
87 |
-
raise OSError(f"Failed to download tokenizer model: {str(e)}")
|
88 |
|
89 |
def instantiate_from_file_or_name(self, model_file_or_name: str, repo_id: str = None):
|
90 |
"""
|
|
|
7 |
from typing import Any, Dict, List, Mapping, Optional, Tuple, Union
|
8 |
|
9 |
import sentencepiece as spm
|
10 |
+
from huggingface_hub import hf_hub_download, list_repo_files, try_to_load_from_cache
|
11 |
from transformers.tokenization_utils import PreTrainedTokenizer
|
12 |
from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE
|
13 |
|
|
|
62 |
f"<placeholder_tok_{i}>" for i in range(256)
|
63 |
]
|
64 |
|
65 |
def find_tokenizer_config(self, config_path: Path, repo_id: Optional[str] = None) -> Optional[Path]:
    """Resolve the tokenizer config file: local path first, then HF cache, then the Hub.

    Args:
        config_path: Candidate local path to the tokenizer_config.json file.
        repo_id: Hugging Face repository id to fall back to when the file
            is not present locally.

    Returns:
        A path (or cached file location) usable to open the tokenizer config,
        or a falsy value if every resolution step came back empty.
    """
    if not os.path.isfile(config_path):
        # Not on disk: probe the local HF cache first to avoid a network round-trip.
        # NOTE(review): try_to_load_from_cache can also return the (truthy)
        # _CACHED_NO_EXIST sentinel, which this truthiness check would pass
        # through as if it were a path — confirm against huggingface_hub docs.
        config_path = try_to_load_from_cache(repo_id=repo_id, filename=Path(config_path).name)
        if not config_path:
            # Cache miss (None) — download from the Hub as a last resort.
            config_path = self._download_config_from_hub(repo_id=repo_id)

    return config_path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
|
74 |
def instantiate_from_file_or_name(self, model_file_or_name: str, repo_id: str = None):
|
75 |
"""
|