#!/usr/bin/env python3
# Copyright 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""DrQA Document Reader predictor"""

import logging

from multiprocessing import Pool as ProcessPool
from multiprocessing.util import Finalize

from .vector import vectorize, batchify
from .model import DocReader
from . import DEFAULTS, utils
from .. import tokenizers

logger = logging.getLogger(__name__)


# ------------------------------------------------------------------------------
# Tokenize + annotate
# ------------------------------------------------------------------------------
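# Each pool worker builds its own tokenizer: tokenizer objects may wrap
# external resources (e.g., a CoreNLP subprocess) and are not reliably
# picklable, so they cannot be shared across processes. init() constructs one
# per worker and Finalize registers a clean shutdown of it at process exit.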
PROCESS_TOK = None


def init(tokenizer_class, annotators):
    global PROCESS_TOK
    PROCESS_TOK = tokenizer_class(annotators=annotators)
    Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)


def tokenize(text):
    global PROCESS_TOK
    return PROCESS_TOK.tokenize(text)


# ------------------------------------------------------------------------------
# Predictor class.
# ------------------------------------------------------------------------------


class Predictor(object):
    """Load a pretrained DocReader model and predict inputs on the fly."""

    def __init__(self, model=None, tokenizer=None, normalize=True,
                 embedding_file=None, num_workers=None):
""" | |
Args: | |
model: path to saved model file. | |
tokenizer: option string to select tokenizer class. | |
normalize: squash output score to 0-1 probabilities with a softmax. | |
embedding_file: if provided, will expand dictionary to use all | |
available pretrained vectors in this file. | |
num_workers: number of CPU processes to use to preprocess batches. | |
""" | |
        logger.info('Initializing model...')
        self.model = DocReader.load(model or DEFAULTS['model'],
                                    normalize=normalize)

        if embedding_file:
            logger.info('Expanding dictionary...')
            words = utils.index_embedding_words(embedding_file)
            added = self.model.expand_dictionary(words)
            self.model.load_embeddings(added, embedding_file)

        logger.info('Initializing tokenizer...')
        annotators = tokenizers.get_annotators_for_model(self.model)
        if not tokenizer:
            tokenizer_class = DEFAULTS['tokenizer']
        else:
            tokenizer_class = tokenizers.get_class(tokenizer)
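
        # num_workers=None lets multiprocessing.Pool default to os.cpu_count();
        # 0 (or any non-positive value) skips the pool entirely, so
        # tokenization runs in-process via self.tokenizer instead.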
        if num_workers is None or num_workers > 0:
            self.workers = ProcessPool(
                num_workers,
                initializer=init,
                initargs=(tokenizer_class, annotators),
            )
        else:
            self.workers = None
            self.tokenizer = tokenizer_class(annotators=annotators)

    def predict(self, document, question, candidates=None, top_n=1):
        """Predict a single document-question pair."""
        results = self.predict_batch([(document, question, candidates,)], top_n)
        return results[0]

    def predict_batch(self, batch, top_n=1):
        """Predict a batch of document-question pairs."""
        documents, questions, candidates = [], [], []
        for b in batch:
            documents.append(b[0])
            questions.append(b[1])
            candidates.append(b[2] if len(b) == 3 else None)
        candidates = candidates if any(candidates) else None

        # Tokenize the inputs, perhaps multi-processed.
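        # Both jobs are submitted with map_async before either get() blocks,
        # so question and document tokenization can proceed through the pool
        # concurrently.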
        if self.workers:
            q_tokens = self.workers.map_async(tokenize, questions)
            d_tokens = self.workers.map_async(tokenize, documents)
            q_tokens = list(q_tokens.get())
            d_tokens = list(d_tokens.get())
        else:
            q_tokens = list(map(self.tokenizer.tokenize, questions))
            d_tokens = list(map(self.tokenizer.tokenize, documents))

        examples = []
        for i in range(len(questions)):
            examples.append({
                'id': i,
                'question': q_tokens[i].words(),
                'qlemma': q_tokens[i].lemmas(),
                'document': d_tokens[i].words(),
                'lemma': d_tokens[i].lemmas(),
                'pos': d_tokens[i].pos(),
                'ner': d_tokens[i].entities(),
            })

        # Stick document tokens in candidates for decoding
        if candidates:
            candidates = [{'input': d_tokens[i], 'cands': candidates[i]}
                          for i in range(len(candidates))]

        # Build the batch and run it through the model
        batch_exs = batchify([vectorize(e, self.model) for e in examples])
        s, e, score = self.model.predict(batch_exs, candidates, top_n)

        # Retrieve the predicted spans
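        # s[i]/e[i] hold the top-n start/end token indices for example i;
        # slice() takes an exclusive end bound, hence e[i][j] + 1, and
        # untokenize() recovers the span text with its original spacing.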
        results = []
        for i in range(len(s)):
            predictions = []
            for j in range(len(s[i])):
                span = d_tokens[i].slice(s[i][j], e[i][j] + 1).untokenize()
                predictions.append((span, score[i][j].item()))
            results.append(predictions)
        return results

    def cuda(self):
        self.model.cuda()

    def cpu(self):
        self.model.cpu()
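

# ------------------------------------------------------------------------------
# Example usage
# ------------------------------------------------------------------------------
# A minimal sketch of the Predictor API above, runnable as
# `python -m drqa.reader.predictor` if this module lives at
# drqa/reader/predictor.py as in the DrQA repository. The checkpoint path and
# input texts are hypothetical placeholders; any trained DrQA reader
# checkpoint should work. num_workers=0 keeps tokenization in-process, which
# avoids spawning a worker pool for a one-off query.

if __name__ == '__main__':
    predictor = Predictor(model='/path/to/reader.mdl', num_workers=0)
    document = ('The Eiffel Tower is a wrought-iron lattice tower on the '
                'Champ de Mars in Paris, France.')
    question = 'Where is the Eiffel Tower?'

    # predict() returns a list of (span, score) tuples, best first.
    for span, score in predictor.predict(document, question, top_n=3):
        print('%s (%.4f)' % (span, score))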