# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This is the script to exam the KNN mapping quality between indexed data and indexed retrieval database.
It requires the training text data to be converted into `bin` and `idx` files by `preprocess_data_for_megatron.py` script.
It also requires KNNIndex built by `build_retrieval_index.py` script.
Here is an example to using it:
```python
python scripts/nlp_language_modeling/exam_knn_map_quality.py \
--input_data_prefix=PATH_TO_DATA \
--input_retrieval_prefix=PATH_TO_RETRIEVAL_DATA \
--knn_index=PATH_TO_KNN_MAP_INDEX \
--chunk_ids 2 3000 4000 5000 6000 \
--tokenizer-library=sentencepiece \
--tokenizer-model=tokenizer.model
```
"""
import argparse
from nemo.collections.nlp.data.language_modeling.megatron.indexed_retrieval_dataset import (
KNNIndex,
MMapRetrievalIndexedDataset,
)
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.utils import logging


def get_tokenizer(args):
    """Build the tokenizer from the command line arguments and make sure it has a usable pad token."""
    tokenizer = get_nmt_tokenizer(
        library=args.tokenizer_library,
        model_name=args.tokenizer_type,
        tokenizer_model=args.tokenizer_model,
        vocab_file=args.vocab_file,
        merges_file=args.merge_file,
        delimiter=args.delimiter,
    )
    # Add a pad token if the tokenizer does not already have a valid pad_id.
    if not hasattr(tokenizer, "pad_id") or tokenizer.pad_id is None or tokenizer.pad_id < 0:
        tokenizer.add_special_tokens({'pad_token': '<pad>'})
    return tokenizer
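
# Example (hypothetical): the helper above can also be used outside of argparse, e.g. from a
# notebook, by passing any object that carries the same attributes it reads:
#
#   from types import SimpleNamespace
#   tokenizer = get_tokenizer(
#       SimpleNamespace(
#           tokenizer_library='sentencepiece',
#           tokenizer_type=None,
#           tokenizer_model='tokenizer.model',
#           vocab_file=None,
#           merge_file=None,
#           delimiter=None,
#       )
#   )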


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Examine the KNN mapping quality between data and retrieval data")
    parser.add_argument(
        '--input_data_prefix', type=str, required=True, help='Input data prefix',
    )
    parser.add_argument(
        '--input_retrieval_prefix', type=str, required=True, help='Input retrieval data prefix',
    )
    parser.add_argument(
        '--knn_index', type=str, required=True, help='Input KNN map index file',
    )
    parser.add_argument(
        '--neighbors', type=int, default=None, help='Number of neighbors to print',
    )
    parser.add_argument(
        '--chunk_ids',
        nargs='+',
        default=[1, 3, 5, 7],
        type=int,
        help='Space-separated list of chunk ids in the input data',
    )
    group = parser.add_argument_group(title='tokenizer')
    group.add_argument(
        '--tokenizer-library',
        type=str,
        required=True,
        choices=['yttm', 'sentencepiece', 'megatron', 'huggingface', 'tabular'],
        help='What tokenizer library to use.',
    )
    group.add_argument(
        '--tokenizer-type', type=str, default=None, help='What type of tokenizer to use.',
    )
    group.add_argument(
        '--tokenizer-model', type=str, default=None, help='Path to tokenizer model.',
    )
    group.add_argument('--vocab-file', type=str, default=None, help='Path to the vocab file')
    group.add_argument('--merge-file', type=str, default=None, help='Path to the BPE merge file (if necessary).')
    group.add_argument('--delimiter', type=str, default=None, help='Delimiter used for the tabular tokenizer')

    args = parser.parse_args()
    tokenizer = get_tokenizer(args)

    # Load the chunked training data, the chunked retrieval data, and the KNN map between them.
    data_ds = MMapRetrievalIndexedDataset(args.input_data_prefix)
    retrieval_ds = MMapRetrievalIndexedDataset(args.input_retrieval_prefix)
    knn_index = KNNIndex(args.knn_index)

    # The KNN map must cover every chunk of the training data, and both datasets must share the same chunk size.
    assert knn_index.len == data_ds.chunks
    logging.info(f'Data index has {data_ds.chunks} chunks')
    logging.info(f'Retrieval Data index has {retrieval_ds.chunks} chunks')
    logging.info(f'KNN index has {knn_index.K} neighbors')
    assert data_ds._index.chunk_size == retrieval_ds._index.chunk_size

    # Print at most `--neighbors` neighbors per query (all K by default).
    print_num_neighbors = knn_index.K
    if args.neighbors is not None:
        assert args.neighbors <= knn_index.K
        print_num_neighbors = args.neighbors

    for chunk_id in args.chunk_ids:
        # Decode the query chunk (without continuation tokens) from the training data.
        token_ids = data_ds.get_chunk(chunk_id, force_no_cont_ids=True)
        assert token_ids.shape[0] == data_ds._index.chunk_size
        query_text = tokenizer.ids_to_text(token_ids)
        # Look up its nearest neighbor chunk ids in the retrieval data.
        neighbor_chunk_ids = knn_index.get_KNN_chunk_ids(chunk_id)
        neighbor_chunk_ids = neighbor_chunk_ids[:print_num_neighbors]
        print(f'Query: {query_text}')
        for i, neighbor in enumerate(neighbor_chunk_ids):
            # Each retrieved chunk contains the matching chunk followed by its continuation.
            token_ids = retrieval_ds.get_chunk(neighbor)
            half = token_ids.shape[0] // 2
            assert half == data_ds._index.chunk_size
            neighbor_match = tokenizer.ids_to_text(token_ids[:half])
            neighbor_extend = tokenizer.ids_to_text(token_ids[half:])
            print(f' ->K{i}: {neighbor_match} --- {neighbor_extend}')
        print(' --------------- ')
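
# The loop above produces console output roughly of the following form (the text is hypothetical):
#
#   Query: <decoded text of the query chunk>
#    ->K0: <matching half of the 1st neighbor chunk> --- <continuation half>
#    ->K1: <matching half of the 2nd neighbor chunk> --- <continuation half>
#    ---------------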