dpr-cocomae-020 / test.py
w32zhong's picture
fixes.
03f842c
import re
import os
import fire
import torch
from functools import partial
from transformers import AutoTokenizer
from transformers import AutoModelForPreTraining
from pya0.preprocess import preprocess_for_transformer
def highlight_masked(txt):
return re.sub(r"(\[MASK\])", '\033[92m' + r"\1" + '\033[0m', txt)
def classifier_hook(tokenizer, tokens, topk, module, inputs, outputs):
unmask_scores, seq_rel_scores = outputs
MSK_CODE = 103
token_ids = tokens['input_ids'][0]
masked_idx = (token_ids == torch.tensor([MSK_CODE]))
scores = unmask_scores[0][masked_idx]
cands = torch.argsort(scores, dim=1, descending=True)
for i, mask_cands in enumerate(cands):
top_cands = mask_cands[:topk].detach().cpu()
print(f'MASK[{i}] top candidates: ' +
str(tokenizer.convert_ids_to_tokens(top_cands)))
def test(tokenizer_name_or_path, model_name_or_path, test_file='test.txt'):
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
model = AutoModelForPreTraining.from_pretrained(model_name_or_path,
tie_word_embeddings=True
)
with open(test_file, 'r') as fh:
for line in fh:
# parse test file line
line = line.rstrip()
fields = line.split('\t')
maskpos = list(map(int, fields[0].split(',')))
# preprocess and mask words
sentence = preprocess_for_transformer(fields[1])
tokens = sentence.split()
for pos in filter(lambda x: x!=0, maskpos):
tokens[pos-1] = '[MASK]'
sentence = ' '.join(tokens)
sentence = sentence.replace('[mask]', '[MASK]')
tokens = tokenizer(sentence,
padding=True, truncation=True, return_tensors="pt")
#print(tokenizer.decode(tokens['input_ids'][0]))
print('*', highlight_masked(sentence))
# print unmasked
with torch.no_grad():
display = ['\n', '']
classifier = model.cls
partial_hook = partial(classifier_hook, tokenizer, tokens, 3)
hook = classifier.register_forward_hook(partial_hook)
model(**tokens)
hook.remove()
if __name__ == '__main__':
os.environ["PAGER"] = 'cat'
fire.Fire(test)