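"""KeeperTokenizer: a composite tokenizer that pairs a retriever tokenizer
with a generator-model tokenizer and serializes both side by side in the
``tokenizer-retriever/`` and ``tokenizer-model/`` subfolders of a checkpoint.
"""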
import os
from typing import List, Union

import torch
from transformers import AutoTokenizer, PreTrainedTokenizer

from .configuration_keeper import KeeperConfig
class KeeperTokenizer(PreTrainedTokenizer):

    config_class = KeeperConfig

    def __init__(self, cfg=None):
        self.tokenizer_retriever = None
        self.tokenizer_model = None

        if cfg:
            print("Initializing KeeperTokenizer with cfg")
            # Load one tokenizer per sub-model from the composite config
            self.tokenizer_retriever = AutoTokenizer.from_pretrained(cfg.retriever_config['_name_or_path'])
            self.tokenizer_model = AutoTokenizer.from_pretrained(cfg.model_config['_name_or_path'])
            # Store the kwargs for future serialization and reloading
            self.init_kwargs = {'cfg': cfg}
            # Initialize the base class only after both tokenizers exist,
            # since PreTrainedTokenizer may query the vocabulary during setup
            super().__init__()
            print("Initialization complete")
        else:
            # Without a cfg, the tokenizers are attached later by from_pretrained
            print("Initializing KeeperTokenizer without cfg")
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        # Create a new KeeperTokenizer instance without a cfg
        instance = cls()

        print("Loading tokenizer_retriever from", pretrained_model_name_or_path)
        instance.tokenizer_retriever = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path, subfolder='tokenizer-retriever'
        )

        print("Loading tokenizer_model from", pretrained_model_name_or_path)
        instance.tokenizer_model = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path, subfolder='tokenizer-model'
        )

        # Finish base-class setup now that both tokenizers are attached;
        # the no-cfg constructor above deliberately skips it
        PreTrainedTokenizer.__init__(instance)

        return instance
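
    # On-disk layout shared by from_pretrained() and save_vocabulary():
    #   <checkpoint>/tokenizer-retriever/   files of the retriever tokenizer
    #   <checkpoint>/tokenizer-model/       files of the generator tokenizer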
    @property
    def vocab_size(self):
        # Fetch the vocabularies of both tokenizers
        vocab_retriever = self.tokenizer_retriever.get_vocab()
        vocab_model = self.tokenizer_model.get_vocab()
        # Merge them; tokens present in both are counted once
        combined_vocab = {**vocab_retriever, **vocab_model}
        # Return the size of the combined vocabulary
        return len(combined_vocab)
    def get_vocab(self):
        # Fetch the vocabularies of both tokenizers
        vocab_retriever = self.tokenizer_retriever.get_vocab()
        vocab_model = self.tokenizer_model.get_vocab()
        # Keep the vocabularies under separate keys instead of merging them
        separated_vocabularies = {
            'vocab_retriever': vocab_retriever,
            'vocab_model': vocab_model
        }
        return separated_vocabularies
    def _tokenize(self, text, **kwargs):
        # Required by the PreTrainedTokenizer interface; this class tokenizes
        # through the two wrapped tokenizers in encode() instead
        raise NotImplementedError("KeeperTokenizer tokenizes via encode().")
    def encode(self, text, **kwargs):
        # Run the same text through both tokenizers; note this returns a dict
        # of BatchEncoding objects rather than a flat list of token ids
        tokens_retriever = self.tokenizer_retriever(text, return_tensors='pt', **kwargs)
        tokens_model = self.tokenizer_model(text, return_tensors='pt', **kwargs)
        return {
            'tokens_retriever': tokens_retriever,
            'tokens_model': tokens_model
        }
    def decode(
        self,
        token_ids: Union[int, List[int], "torch.Tensor"],
        skip_special_tokens: bool = False,
        **kwargs,
    ) -> str:
        # Only the generator model's ids are ever decoded back to text
        return self.tokenizer_model.decode(token_ids, skip_special_tokens=skip_special_tokens, **kwargs)
    def save_vocabulary(self, save_directory, filename_prefix=None):
        # Make sure the output directory exists
        os.makedirs(save_directory, exist_ok=True)

        # Save the retriever tokenizer
        retriever_save_directory = os.path.join(save_directory, "tokenizer-retriever")
        os.makedirs(retriever_save_directory, exist_ok=True)
        retriever_files = self.tokenizer_retriever.save_pretrained(retriever_save_directory)

        # Save the model tokenizer
        model_save_directory = os.path.join(save_directory, "tokenizer-model")
        os.makedirs(model_save_directory, exist_ok=True)
        model_files = self.tokenizer_model.save_pretrained(model_save_directory)

        # Return the paths actually written by save_pretrained; the exact file
        # names (vocab.json, vocab.txt, merges.txt, ...) vary by tokenizer type
        return tuple(retriever_files) + tuple(model_files)
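
# A minimal usage sketch, assuming a checkpoint previously written by
# save_vocabulary() with tokenizer-retriever/ and tokenizer-model/ subfolders.
# "user/keeper-checkpoint" is a placeholder, not a real repo id, and because
# of the relative import above this module must be run with `python -m`.
if __name__ == "__main__":
    tokenizer = KeeperTokenizer.from_pretrained("user/keeper-checkpoint")

    # encode() yields one BatchEncoding per wrapped tokenizer
    batch = tokenizer.encode("How do composite tokenizers work?")
    print(batch['tokens_retriever']['input_ids'].shape)
    print(batch['tokens_model']['input_ids'].shape)

    # decode() consults only the generator-model tokenizer
    text = tokenizer.decode(batch['tokens_model']['input_ids'][0], skip_special_tokens=True)
    print(text)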