""" |
|
This is the script to exam the KNN mapping quality between indexed data and indexed retrieval database. |
|
|
|
It requires the training text data to be converted into `bin` and `idx` files by `preprocess_data_for_megatron.py` script. |
|
It also requires KNNIndex built by `build_retrieval_index.py` script. |
|
|
|
Here is an example to using it: |
|
|
|
```python |
|
python scripts/nlp_language_modeling/exam_knn_map_quality.py \ |
|
--input_data_prefix=PATH_TO_DATA \ |
|
--input_retrieval_prefix=PATH_TO_RETRIEVAL_DATA \ |
|
--knn_index=PATH_TO_KNN_MAP_INDEX \ |
|
--chunk_ids 2 3000 4000 5000 6000 \ |
|
--tokenizer-library=sentencepiece \ |
|
--tokenizer-model=tokenizer.model |
|
``` |
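
The same classes this script relies on can also be driven directly from Python for a quick spot check. The snippet below is a minimal sketch: the paths are placeholders (matching the CLI example above), and only calls already used in this script are shown:

```python
from nemo.collections.nlp.data.language_modeling.megatron.indexed_retrieval_dataset import (
    KNNIndex,
    MMapRetrievalIndexedDataset,
)

# load the chunked training data, the chunked retrieval database, and the KNN map
data_ds = MMapRetrievalIndexedDataset('PATH_TO_DATA')
retrieval_ds = MMapRetrievalIndexedDataset('PATH_TO_RETRIEVAL_DATA')
knn_index = KNNIndex('PATH_TO_KNN_MAP_INDEX')

chunk_id = 2
# token ids of the query chunk, without the continuation tokens
query_ids = data_ds.get_chunk(chunk_id, force_no_cont_ids=True)
# retrieval-database chunk ids of the K nearest neighbors of this query chunk
neighbor_ids = knn_index.get_KNN_chunk_ids(chunk_id)
```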
|
""" |
|
import argparse

from nemo.collections.nlp.data.language_modeling.megatron.indexed_retrieval_dataset import (
    KNNIndex,
    MMapRetrievalIndexedDataset,
)
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.utils import logging


def get_tokenizer(args):
    tokenizer = get_nmt_tokenizer(
        library=args.tokenizer_library,
        model_name=args.tokenizer_type,
        tokenizer_model=args.tokenizer_model,
        vocab_file=args.vocab_file,
        merges_file=args.merge_file,
        delimiter=args.delimiter,
    )
    # make sure the tokenizer exposes a usable pad token id
    if not hasattr(tokenizer, "pad_id"):
        tokenizer.add_special_tokens({'pad_token': '<pad>'})
    elif hasattr(tokenizer, "pad_id") and (tokenizer.pad_id is None or tokenizer.pad_id < 0):
        tokenizer.add_special_tokens({'pad_token': '<pad>'})
    return tokenizer


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Examine KNN mapping quality",)
    parser.add_argument(
        '--input_data_prefix', type=str, required=True, help='Input data prefix',
    )
    parser.add_argument(
        '--input_retrieval_prefix', type=str, required=True, help='Input retrieval data prefix',
    )
    parser.add_argument(
        '--knn_index', type=str, required=True, help='Input KNN map index file',
    )
    parser.add_argument(
        '--neighbors', type=int, default=None, help='Number of neighbors to print',
    )
    parser.add_argument(
        '--chunk_ids',
        nargs='+',
        default=[1, 3, 5, 7],
        type=int,
        help='Space-separated list of chunk ids in the input data',
    )
    group = parser.add_argument_group(title='tokenizer')
    group.add_argument(
        '--tokenizer-library',
        type=str,
        required=True,
        choices=['yttm', 'sentencepiece', 'megatron', 'huggingface', 'tabular'],
        help='What tokenizer library to use.',
    )
    group.add_argument(
        '--tokenizer-type', type=str, default=None, help='What type of tokenizer to use.',
    )
    group.add_argument(
        '--tokenizer-model', type=str, default=None, help='Path to tokenizer model.',
    )
    group.add_argument('--vocab-file', type=str, default=None, help='Path to the vocab file')
    group.add_argument('--merge-file', type=str, default=None, help='Path to the BPE merge file (if necessary).')
    group.add_argument('--delimiter', type=str, default=None, help='Delimiter used for the tabular tokenizer')

    args = parser.parse_args()

    tokenizer = get_tokenizer(args)
    # load the chunked training data, the chunked retrieval database, and the KNN map
    data_ds = MMapRetrievalIndexedDataset(args.input_data_prefix)
    retrieval_ds = MMapRetrievalIndexedDataset(args.input_retrieval_prefix)
    knn_index = KNNIndex(args.knn_index)
    # every chunk in the training data must have an entry in the KNN map
    assert knn_index.len == data_ds.chunks
    logging.info(f'Data index has {data_ds.chunks} chunks')
    logging.info(f'Retrieval Data index has {retrieval_ds.chunks} chunks')
    logging.info(f'KNN index has {knn_index.K} neighbors')
    # both datasets must be chunked with the same chunk size
    assert data_ds._index.chunk_size == retrieval_ds._index.chunk_size
    print_num_neighbors = knn_index.K
    if args.neighbors is not None:
        assert args.neighbors <= knn_index.K
        print_num_neighbors = args.neighbors
|
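    # for each requested chunk, print the query chunk text followed by the text of its retrieved neighbors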
    for chunk_id in args.chunk_ids:
        token_ids = data_ds.get_chunk(chunk_id, force_no_cont_ids=True)
        assert token_ids.shape[0] == data_ds._index.chunk_size
        query_text = tokenizer.ids_to_text(token_ids)
        neighbor_chunk_ids = knn_index.get_KNN_chunk_ids(chunk_id)
        neighbor_chunk_ids = neighbor_chunk_ids[:print_num_neighbors]
        print(f'Query: {query_text}')
        for i, neighbor in enumerate(neighbor_chunk_ids):
            # each retrieval chunk is twice the chunk size: the matched chunk followed by its continuation chunk
            token_ids = retrieval_ds.get_chunk(neighbor)
            half = token_ids.shape[0] // 2
            assert half == data_ds._index.chunk_size
            neighbor_match = tokenizer.ids_to_text(token_ids[:half])
            neighbor_extend = tokenizer.ids_to_text(token_ids[half:])
            print(f' ->K{i}: {neighbor_match} --- {neighbor_extend}')
        print(' --------------- ')