Text Generation
Transformers
Safetensors
llama
text-generation-inference
Inference Endpoints
mfromm committed on
Commit
8f4c21d
·
verified ·
1 Parent(s): 703d17e

Update gptx_tokenizer.py

Browse files
Files changed (1) hide show
  1. gptx_tokenizer.py +9 -24
gptx_tokenizer.py CHANGED
@@ -7,7 +7,7 @@ from pathlib import Path
7
  from typing import Any, Dict, List, Mapping, Optional, Tuple, Union
8
 
9
  import sentencepiece as spm
10
- from huggingface_hub import hf_hub_download, list_repo_files
11
  from transformers.tokenization_utils import PreTrainedTokenizer
12
  from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE
13
 
@@ -62,29 +62,14 @@ class HFGPTXTokenizer(PreTrainedTokenizer):
62
  f"<placeholder_tok_{i}>" for i in range(256)
63
  ]
64
 
65
- def find_tokenizer_config(self, config_path: Path, repo_id: str = None) -> Path:
66
- if repo_id is None:
67
- raise ValueError("repo_id must be provided if config_path is not a local file")
68
-
69
- try:
70
- # List all files in the repo
71
- repo_files = list_repo_files(repo_id)
72
-
73
- # Find the tokenizer config file
74
- tokenizer_files = [f for f in repo_files if f.endswith('tokenizer_config.json')]
75
- if not tokenizer_files:
76
- raise FileNotFoundError(f"No tokenizer_config.json file found in repository {repo_id}")
77
-
78
- # Use the first tokenizer_config.json file found
79
- tokenizer_config_file = tokenizer_files[0]
80
- print(f"Found tokenizer config file: {tokenizer_config_file}")
81
-
82
- # Download the file
83
- tokenizer_config_file_or_name = hf_hub_download(repo_id=repo_id, filename=tokenizer_config_file)
84
- print(f"Downloaded tokenizer config file to: {tokenizer_config_file_or_name}")
85
- return tokenizer_config_file_or_name
86
- except Exception as e:
87
- raise OSError(f"Failed to download tokenizer model: {str(e)}")
88
 
89
  def instantiate_from_file_or_name(self, model_file_or_name: str, repo_id: str = None):
90
  """
 
7
  from typing import Any, Dict, List, Mapping, Optional, Tuple, Union
8
 
9
  import sentencepiece as spm
10
+ from huggingface_hub import hf_hub_download, list_repo_files, try_to_load_from_cache
11
  from transformers.tokenization_utils import PreTrainedTokenizer
12
  from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE
13
 
 
62
  f"<placeholder_tok_{i}>" for i in range(256)
63
  ]
64
 
65
def find_tokenizer_config(self, config_path: Path, repo_id: str = None) -> Optional[Path]:
    """Resolve a usable tokenizer config file: local path, HF cache, or hub download.

    Args:
        config_path: Candidate local path to the tokenizer config file
            (e.g. ``tokenizer_config.json``).
        repo_id: Hugging Face repository to fall back to when ``config_path``
            does not exist locally.

    Returns:
        A filesystem path (str/Path) to a readable tokenizer config file.

    Raises:
        ValueError: If the file is not local and no ``repo_id`` was given.
    """
    if os.path.isfile(config_path):
        return config_path

    if repo_id is None:
        # Without a repo_id we can neither consult the hub cache nor download;
        # fail early with a clear message (matches the pre-refactor behavior).
        raise ValueError("repo_id must be provided if config_path is not a local file")

    # try_to_load_from_cache returns a str path on a cache hit, None on a cache
    # miss, or the _CACHED_NO_EXIST sentinel (a truthy object) when the file is
    # known to be absent from the repo — so test for str explicitly rather than
    # relying on truthiness.
    cached = try_to_load_from_cache(repo_id=repo_id, filename=Path(config_path).name)
    if isinstance(cached, str):
        return cached

    # Not local and not cached: fetch from the hub.
    return self._download_config_from_hub(repo_id=repo_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
  def instantiate_from_file_or_name(self, model_file_or_name: str, repo_id: str = None):
75
  """