# Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script examines the KNN mapping quality between the indexed training data and the indexed retrieval database.

It requires the training text data to be converted into `bin` and `idx` files by the `preprocess_data_for_megatron.py` script.
It also requires a KNN index built by the `build_retrieval_index.py` script.

Here is an example of how to use it:

```bash
python scripts/nlp_language_modeling/exam_knn_map_quality.py \
    --input_data_prefix=PATH_TO_DATA \
    --input_retrieval_prefix=PATH_TO_RETRIEVAL_DATA \
    --knn_index=PATH_TO_KNN_MAP_INDEX \
    --chunk_ids 2 3000 4000 5000 6000 \
    --tokenizer-library=sentencepiece \
    --tokenizer-model=tokenizer.model
```
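
For each chunk id, the script prints the decoded query chunk followed by its top-K retrieved neighbors;
each neighbor line shows the matched half of the retrieval chunk and its continuation, separated by `---`.
Use `--neighbors` to limit how many of the stored neighbors are printed.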

"""
import argparse

from nemo.collections.nlp.data.language_modeling.megatron.indexed_retrieval_dataset import (
    KNNIndex,
    MMapRetrievalIndexedDataset,
)
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.utils import logging


def get_tokenizer(args):
    tokenizer = get_nmt_tokenizer(
        library=args.tokenizer_library,
        model_name=args.tokenizer_type,
        tokenizer_model=args.tokenizer_model,
        vocab_file=args.vocab_file,
        merges_file=args.merge_file,
        delimiter=args.delimiter,
    )
    # Make sure the tokenizer exposes a usable pad token.
    if not hasattr(tokenizer, "pad_id") or tokenizer.pad_id is None or tokenizer.pad_id < 0:
        tokenizer.add_special_tokens({'pad_token': '<pad>'})
    return tokenizer


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Examine the KNN mapping quality between indexed data and the indexed retrieval database.",)
    parser.add_argument(
        '--input_data_prefix', type=str, required=True, help='Input data prefix',
    )
    parser.add_argument(
        '--input_retrieval_prefix', type=str, required=True, help='Input retrieval data prefix',
    )
    parser.add_argument(
        '--knn_index', type=str, required=True, help='Input knn map index file',
    )
    parser.add_argument(
        '--neighbors', type=int, default=None, help='number of neighbors',
    )
    parser.add_argument(
        '--chunk_ids',
        nargs='+',
        default=[1, 3, 5, 7],
        type=int,
        help='space-separated list of chunk ids in the input data',
    )
    group = parser.add_argument_group(title='tokenizer')
    group.add_argument(
        '--tokenizer-library',
        type=str,
        required=True,
        choices=['yttm', 'sentencepiece', 'megatron', 'huggingface', 'tabular'],
        help='What tokenizer library to use.',
    )
    group.add_argument(
        '--tokenizer-type', type=str, default=None, help='What type of tokenizer to use.',
    )
    group.add_argument(
        '--tokenizer-model', type=str, default=None, help='Path to tokenizer model.',
    )
    group.add_argument('--vocab-file', type=str, default=None, help='Path to the vocab file')
    group.add_argument('--merge-file', type=str, default=None, help='Path to the BPE merge file (if necessary).')
    group.add_argument('--delimiter', type=str, default=None, help='delimiter used for tabular tokenizer')

    args = parser.parse_args()

    tokenizer = get_tokenizer(args)
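    # Load the chunked training data, the retrieval database, and the precomputed chunk-level KNN map.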
    data_ds = MMapRetrievalIndexedDataset(args.input_data_prefix)
    retrieval_ds = MMapRetrievalIndexedDataset(args.input_retrieval_prefix)
    knn_index = KNNIndex(args.knn_index)
    assert knn_index.len == data_ds.chunks
    logging.info(f'Data index has {data_ds.chunks} chunks')
    logging.info(f'Retrieval Data index has {retrieval_ds.chunks} chunks')
    logging.info(f'KNN index has {knn_index.K} neighbors')
    assert data_ds._index.chunk_size == retrieval_ds._index.chunk_size
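    # Optionally print only the first `--neighbors` of the K neighbors stored in the index.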
    print_num_neighbors = knn_index.K
    if args.neighbors is not None:
        assert args.neighbors <= knn_index.K
        print_num_neighbors = args.neighbors

    for chunk_id in args.chunk_ids:
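        # Decode the query chunk from the training data; force_no_cont_ids drops the continuation tokens.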
        token_ids = data_ds.get_chunk(chunk_id, force_no_cont_ids=True)
        assert token_ids.shape[0] == data_ds._index.chunk_size
        query_text = tokenizer.ids_to_text(token_ids)
        neighbor_chunk_ids = knn_index.get_KNN_chunk_ids(chunk_id)
        neighbor_chunk_ids = neighbor_chunk_ids[:print_num_neighbors]
        print(f'Query: {query_text}')
        for i, neighbor in enumerate(neighbor_chunk_ids):
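            # Each retrieval chunk stores the matched tokens followed by their continuation,
            # each half of length chunk_size, so split the decoded text at the midpoint.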
            token_ids = retrieval_ds.get_chunk(neighbor)
            half = token_ids.shape[0] // 2
            assert half == data_ds._index.chunk_size
            neighbor_match = tokenizer.ids_to_text(token_ids[:half])
            neighbor_extend = tokenizer.ids_to_text(token_ids[half:])
            print(f' ->K{i}: {neighbor_match} --- {neighbor_extend}')
        print('         ---------------         ')