Text Generation
Safetensors
Spanish
Paraguay
Culture
Custom Code
Guaraní
unsloth
gua-a / tokenizer_keeper.py
enpaiva's picture
Update tokenizer_keeper.py
008b9db verified
raw
history blame
4.66 kB
import os
import json
import torch
from transformers import (
AutoTokenizer,
PreTrainedTokenizer
)
from .configuration_keeper import KeeperConfig
from typing import Optional, List, Union
class KeeperTokenizer(PreTrainedTokenizer):
config_class = KeeperConfig
def __init__(self, cfg=None):
self.tokenizer_retriever = None
self.tokenizer_model = None
# if cfg:
# print("Initializing KeeperTokenizer with cfg")
# # Inicialización con configuración
# self.tokenizer_retriever = AutoTokenizer.from_pretrained(cfg.retriever_config['_name_or_path'])
# self.tokenizer_model = AutoTokenizer.from_pretrained(cfg.model_config['_name_or_path'])
# # Almacena kwargs para la serialización y carga futura
# self.init_kwargs = {'cfg': cfg}
# super().__init__() # Inicializa la clase base al principio
# print("Initialization complete")
# else:
# # Si cfg no se proporciona, esto se manejará en el método from_pretrained
# print("Initializing KeeperTokenizer without cfg")
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
# Crea una nueva instancia de KeeperTokenizer sin cfg
instance = cls()
print("Loading tokenizer_retriever from", pretrained_model_name_or_path)
instance.tokenizer_retriever = AutoTokenizer.from_pretrained(
pretrained_model_name_or_path, subfolder='tokenizer-retriever'
)
print("Loading tokenizer_model from", pretrained_model_name_or_path)
instance.tokenizer_model = AutoTokenizer.from_pretrained(
pretrained_model_name_or_path, subfolder='tokenizer-model'
)
return instance
@property
def vocab_size(self):
# Obtiene los vocabularios de ambos tokenizadores
vocab_retriever = self.tokenizer_retriever.get_vocab()
vocab_model = self.tokenizer_model.get_vocab()
# Combina los vocabularios
combined_vocab = {**vocab_retriever, **vocab_model}
# Devuelve el tamaño del vocabulario combinado
return len(combined_vocab)
def get_vocab(self):
# Obtiene los vocabularios de ambos tokenizadores
vocab_retriever = self.tokenizer_retriever.get_vocab()
vocab_model = self.tokenizer_model.get_vocab()
# Organiza los vocabularios en un diccionario con claves separadas
separated_vocabularies = {
'vocab_retriever': vocab_retriever,
'vocab_model': vocab_model
}
return separated_vocabularies
def _tokenize(self, text, **kwargs):
# You must implement this method for your tokenization logic
pass
def encode(self, text, **kwargs):
tokens_retriever = self.tokenizer_retriever(text, return_tensors='pt', **kwargs)
tokens_model = self.tokenizer_model(text, return_tensors='pt', **kwargs)
return {
'tokens_retriever': tokens_retriever,
'tokens_model': tokens_model
}
def decode(
self,
token_ids: Union[int, List[int], "torch.Tensor"],
skip_special_tokens: bool = False,
**kwargs,
) -> str:
return self.tokenizer_model.decode(token_ids, skip_special_tokens, **kwargs)
def save_vocabulary(self, save_directory, filename_prefix=None):
# Asegúrate de que el directorio de salida existe
os.makedirs(save_directory, exist_ok=True)
# Guarda el tokenizador retriever
retriever_save_directory = os.path.join(save_directory, "tokenizer-retriever")
os.makedirs(retriever_save_directory, exist_ok=True)
self.tokenizer_retriever.save_pretrained(retriever_save_directory)
# Guarda el tokenizador model
model_save_directory = os.path.join(save_directory, "tokenizer-model")
os.makedirs(model_save_directory, exist_ok=True)
self.tokenizer_model.save_pretrained(model_save_directory)
# Devuelve los nombres de los archivos guardados (opcional)
saved_files = [
"tokenizer-retriver/tokenizer_config.json",
"tokenizer-retriver/special_tokens_map.json",
"tokenizer-retriver/vocab.json",
"tokenizer-retriver/added_tokens.json",
"tokenizer-model/tokenizer_config.json",
"tokenizer-model/special_tokens_map.json",
"tokenizer-model/vocab.json",
"tokenizer-model/added_tokens.json"
]
return tuple(os.path.join(save_directory, file) for file in saved_files)