#!/usr/bin/env python3
# Copyright 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""DrQA Document Reader predictor"""
import logging
from multiprocessing import Pool as ProcessPool
from multiprocessing.util import Finalize
from .vector import vectorize, batchify
from .model import DocReader
from . import DEFAULTS, utils
from .. import tokenizers
logger = logging.getLogger(__name__)
# ------------------------------------------------------------------------------
# Tokenize + annotate
# ------------------------------------------------------------------------------
PROCESS_TOK = None
def init(tokenizer_class, annotators):
    global PROCESS_TOK
    PROCESS_TOK = tokenizer_class(annotators=annotators)
    Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)

def tokenize(text):
    global PROCESS_TOK
    return PROCESS_TOK.tokenize(text)

# ------------------------------------------------------------------------------
# Predictor class.
# ------------------------------------------------------------------------------
class Predictor(object):
"""Load a pretrained DocReader model and predict inputs on the fly."""
def __init__(self, model=None, tokenizer=None, normalize=True,
embedding_file=None, num_workers=None):
"""
Args:
model: path to saved model file.
tokenizer: option string to select tokenizer class.
normalize: squash output score to 0-1 probabilities with a softmax.
embedding_file: if provided, will expand dictionary to use all
available pretrained vectors in this file.
num_workers: number of CPU processes to use to preprocess batches.
"""
        logger.info('Initializing model...')
        self.model = DocReader.load(model or DEFAULTS['model'],
                                    normalize=normalize)

        if embedding_file:
            logger.info('Expanding dictionary...')
            words = utils.index_embedding_words(embedding_file)
            added = self.model.expand_dictionary(words)
            self.model.load_embeddings(added, embedding_file)

        logger.info('Initializing tokenizer...')
        annotators = tokenizers.get_annotators_for_model(self.model)

        if not tokenizer:
            tokenizer_class = DEFAULTS['tokenizer']
        else:
            tokenizer_class = tokenizers.get_class(tokenizer)

        # Tokenize with a process pool unless explicitly disabled by passing
        # num_workers <= 0, in which case tokenization happens in-process.
        if num_workers is None or num_workers > 0:
            self.workers = ProcessPool(
                num_workers,
                initializer=init,
                initargs=(tokenizer_class, annotators),
            )
        else:
            self.workers = None
            self.tokenizer = tokenizer_class(annotators=annotators)

    def predict(self, document, question, candidates=None, top_n=1):
        """Predict a single document - question pair."""
        results = self.predict_batch([(document, question, candidates,)], top_n)
        return results[0]

    def predict_batch(self, batch, top_n=1):
        """Predict a batch of document - question pairs."""
        documents, questions, candidates = [], [], []
        for b in batch:
            documents.append(b[0])
            questions.append(b[1])
            candidates.append(b[2] if len(b) == 3 else None)
        candidates = candidates if any(candidates) else None

        # Tokenize the inputs, perhaps multi-processed.
        if self.workers:
            q_tokens = self.workers.map_async(tokenize, questions)
            d_tokens = self.workers.map_async(tokenize, documents)
            q_tokens = list(q_tokens.get())
            d_tokens = list(d_tokens.get())
        else:
            q_tokens = list(map(self.tokenizer.tokenize, questions))
            d_tokens = list(map(self.tokenizer.tokenize, documents))

        examples = []
        for i in range(len(questions)):
            examples.append({
                'id': i,
                'question': q_tokens[i].words(),
                'qlemma': q_tokens[i].lemmas(),
                'document': d_tokens[i].words(),
                'lemma': d_tokens[i].lemmas(),
                'pos': d_tokens[i].pos(),
                'ner': d_tokens[i].entities(),
            })

        # Stick document tokens in candidates for decoding
        if candidates:
            candidates = [{'input': d_tokens[i], 'cands': candidates[i]}
                          for i in range(len(candidates))]

        # Build the batch and run it through the model
        batch_exs = batchify([vectorize(e, self.model) for e in examples])
        s, e, score = self.model.predict(batch_exs, candidates, top_n)

        # Retrieve the predicted spans
        results = []
        for i in range(len(s)):
            predictions = []
            for j in range(len(s[i])):
                span = d_tokens[i].slice(s[i][j], e[i][j] + 1).untokenize()
                predictions.append((span, score[i][j].item()))
            results.append(predictions)
        return results

    def cuda(self):
        self.model.cuda()

    def cpu(self):
        self.model.cpu()
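

# ------------------------------------------------------------------------------
# Example usage (illustrative sketch, not part of the original module).
# The checkpoint path below is a placeholder; substitute a real DrQA reader
# model file. Runs single-process (num_workers=0) to keep the sketch simple.
# Invoke as a module (e.g. `python -m drqa.reader.predictor`) so the relative
# imports above resolve.
# ------------------------------------------------------------------------------

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    predictor = Predictor(model='/path/to/reader.mdl', num_workers=0)
    document = ('The Eiffel Tower is a wrought-iron lattice tower on the '
                'Champ de Mars in Paris, France.')
    question = 'Where is the Eiffel Tower?'
    # predict() returns the top_n (span, score) pairs for one document-question pair.
    for span, score in predictor.predict(document, question, top_n=3):
        print('%s\t%.4f' % (span, score))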