# Pubmed_Analyzer / questiongenerator.py
# Qiwei97's picture
# Update questiongenerator.py
# 56fed24
# Adapted from https://github.com/AMontgomerie/question_generator
import os
import sys
import math
import numpy as np
import torch
import spacy
import re
import random
import json
import en_core_web_sm
from transformers import (
AutoTokenizer,
AutoModelForSeq2SeqLM,
AutoModelForSequenceClassification,
)
class QuestionGenerator:
    """Generate question/answer pairs from a text.

    Questions are produced by a pretrained seq2seq model
    (``iarfmoose/t5-base-question-generator``). Two answer styles exist:
    full-sentence answers and multiple-choice answers built from named
    entities found by spaCy. Generated pairs can optionally be ranked by a
    ``QAEvaluator``.
    """

    # spaCy pipeline, loaded lazily on first multiple-choice request and then
    # reused, so repeated calls do not pay the model-loading cost again.
    _spacy_nlp = None

    def __init__(self, model_dir=None):
        """Load the question-generation model and build the QA evaluator.

        Args:
            model_dir: Passed through to ``QAEvaluator``; not used otherwise.
        """
        QG_PRETRAINED = "iarfmoose/t5-base-question-generator"
        self.ANSWER_TOKEN = "<answer>"
        self.CONTEXT_TOKEN = "<context>"
        self.SEQ_LENGTH = 512
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.qg_tokenizer = AutoTokenizer.from_pretrained(QG_PRETRAINED, use_fast=False)
        self.qg_model = AutoModelForSeq2SeqLM.from_pretrained(QG_PRETRAINED)
        self.qg_model.to(self.device)
        self.qa_evaluator = QAEvaluator(model_dir)

    def generate(
        self, article, use_evaluator=True, num_questions=None, answer_style="all"
    ):
        """Generate a list of ``{"question": ..., "answer": ...}`` dicts.

        Args:
            article: Source text to generate questions from.
            use_evaluator: When true, pairs are scored by the QA evaluator and
                returned best-first; otherwise every pair is returned in
                generation order.
            num_questions: Maximum number of ranked pairs to return. A falsy
                value falls back to the default of ``_get_ranked_qa_pairs``.
                Ignored when ``use_evaluator`` is false.
            answer_style: ``"all"``, ``"sentences"`` or ``"multiple_choice"``.

        Returns:
            A list of question/answer dicts. Sentence answers are strings;
            multiple-choice answers are lists of
            ``{"answer": str, "correct": bool}`` dicts.

        Raises:
            ValueError: If ``answer_style`` is not a supported value.
        """
        print("Generating questions...\n")
        qg_inputs, qg_answers = self.generate_qg_inputs(article, answer_style)
        generated_questions = self.generate_questions_from_inputs(qg_inputs)
        message = "{} questions doesn't match {} answers".format(
            len(generated_questions), len(qg_answers)
        )
        assert len(generated_questions) == len(qg_answers), message

        if not use_evaluator:
            print("Skipping evaluation step.\n")
            return self._get_all_qa_pairs(generated_questions, qg_answers)

        print("Evaluating QA pairs...\n")
        encoded_qa_pairs = self.qa_evaluator.encode_qa_pairs(
            generated_questions, qg_answers
        )
        scores = self.qa_evaluator.get_scores(encoded_qa_pairs)
        if num_questions:
            return self._get_ranked_qa_pairs(
                generated_questions, qg_answers, scores, num_questions
            )
        return self._get_ranked_qa_pairs(generated_questions, qg_answers, scores)

    def generate_qg_inputs(self, text, answer_style):
        """Build model inputs and their matching answers for ``text``.

        Args:
            text: Source text.
            answer_style: ``"all"``, ``"sentences"`` or ``"multiple_choice"``.

        Returns:
            A ``(inputs, answers)`` tuple of equal-length lists.

        Raises:
            ValueError: If ``answer_style`` is not a supported value.
        """
        VALID_ANSWER_STYLES = ["all", "sentences", "multiple_choice"]
        if answer_style not in VALID_ANSWER_STYLES:
            raise ValueError(
                "Invalid answer style {}. Please choose from {}".format(
                    answer_style, VALID_ANSWER_STYLES
                )
            )

        inputs = []
        answers = []

        if answer_style in ("sentences", "all"):
            # Sentence answers use the whole surrounding segment as context.
            for segment in self._split_into_segments(text):
                sentences = self._split_text(segment)
                prepped_inputs, prepped_answers = self._prepare_qg_inputs(
                    sentences, segment
                )
                inputs.extend(prepped_inputs)
                answers.extend(prepped_answers)

        if answer_style in ("multiple_choice", "all"):
            sentences = self._split_text(text)
            prepped_inputs, prepped_answers = self._prepare_qg_inputs_MC(sentences)
            inputs.extend(prepped_inputs)
            answers.extend(prepped_answers)

        return inputs, answers

    def generate_questions_from_inputs(self, qg_inputs):
        """Return one generated question string per entry of ``qg_inputs``."""
        return [self._generate_question(qg_input) for qg_input in qg_inputs]

    def _split_text(self, text):
        """Split ``text`` into sentences plus fragments of over-long sentences.

        Sentences longer than ``MAX_SENTENCE_LEN`` characters are additionally
        cut at ``, ; : )``; fragments of more than five space-separated words
        are kept alongside the full sentences.

        Returns:
            Unique, space-stripped strings in first-seen order.
        """
        MAX_SENTENCE_LEN = 128
        sentences = re.findall(r".*?[.!?]", text)

        cut_sentences = []
        for sentence in sentences:
            if len(sentence) > MAX_SENTENCE_LEN:
                cut_sentences.extend(re.split(r"[,;:)]", sentence))

        # Drop short, useless fragments. The filter must run over the cut
        # fragments; filtering `sentences` here would discard every fragment.
        cut_sentences = [s for s in cut_sentences if len(s.split(" ")) > 5]

        # dict.fromkeys de-duplicates while keeping a deterministic order.
        stripped = (s.strip(" ") for s in sentences + cut_sentences)
        return list(dict.fromkeys(stripped))

    def _split_into_segments(self, text):
        """Group the paragraphs of ``text`` into segments of roughly MAX_TOKENS.

        Paragraphs (split on newlines, empty ones skipped) are appended to the
        current segment until it holds at least ``MAX_TOKENS`` tokens, so a
        segment may exceed the limit by part of its last paragraph.

        Returns:
            The decoded text of each segment.
        """
        MAX_TOKENS = 490
        # add_special_tokens=False keeps per-paragraph end-of-sequence markers
        # out of the ids, so they do not leak into the decoded segment text.
        tokenized_paragraphs = [
            self.qg_tokenizer(p, add_special_tokens=False)["input_ids"]
            for p in text.split("\n")
            if len(p) > 0
        ]

        segments = []
        current = []
        for paragraph in tokenized_paragraphs:
            current.extend(paragraph)
            if len(current) >= MAX_TOKENS:
                segments.append(current)
                current = []
        if current:
            segments.append(current)

        return [self.qg_tokenizer.decode(s) for s in segments]

    def _prepare_qg_inputs(self, sentences, text):
        """Build sentence-style inputs: each sentence is the answer, ``text`` the context.

        Returns:
            A ``(inputs, answers)`` tuple; ``answers`` holds the sentences.
        """
        inputs = [
            "{} {} {} {}".format(
                self.ANSWER_TOKEN, sentence, self.CONTEXT_TOKEN, text
            )
            for sentence in sentences
        ]
        return inputs, list(sentences)

    def _prepare_qg_inputs_MC(self, sentences):
        """Build multiple-choice inputs from the named entities of ``sentences``.

        Every entity becomes the answer of one input whose context is the
        sentence it came from; its distractors are drawn from the entities of
        all sentences.

        Returns:
            A ``(inputs, answers)`` tuple; each answer is a list of choice dicts.
        """
        if self._spacy_nlp is None:
            self._spacy_nlp = en_core_web_sm.load()
        docs = list(self._spacy_nlp.pipe(sentences, disable=["parser"]))

        inputs_from_text = []
        answers_from_text = []
        for sentence, doc in zip(sentences, docs):
            for entity in doc.ents:
                qg_input = "{} {} {} {}".format(
                    self.ANSWER_TOKEN, entity, self.CONTEXT_TOKEN, sentence
                )
                inputs_from_text.append(qg_input)
                answers_from_text.append(self._get_MC_answers(entity, docs))
        return inputs_from_text, answers_from_text

    def _get_MC_answers(self, correct_answer, docs):
        """Build a shuffled list of up to four choices for ``correct_answer``.

        Distractors are other entity texts found in ``docs``, preferring those
        that share the correct answer's NER label and topping up with random
        other entities when there are not enough.

        Args:
            correct_answer: Object exposing ``.text`` and ``.label_``.
            docs: Iterable of objects exposing ``.ents``.

        Returns:
            A list of ``{"answer": str, "correct": bool}`` dicts with exactly
            one correct entry.
        """
        correct_text = correct_answer.text
        correct_label = correct_answer.label_

        # Collect distinct distractor texts with the labels they were seen
        # with. Anything that reads the same as the correct answer is skipped
        # so a "wrong" choice can never be identical to the right one.
        labels_by_text = {}
        for doc in docs:
            for entity in doc.ents:
                if entity.text != correct_text:
                    labels_by_text.setdefault(entity.text, set()).add(entity.label_)

        # Sorted lists: random.sample no longer accepts sets (Python 3.11+),
        # and a stable order keeps seeded runs reproducible.
        candidates = sorted(labels_by_text)
        same_label = [t for t in candidates if correct_label in labels_by_text[t]]
        others = [t for t in candidates if correct_label not in labels_by_text[t]]

        num_choices = min(3, len(candidates))  # the correct answer is the 4th
        if len(same_label) >= num_choices:
            distractors = random.sample(same_label, num_choices)
        else:
            distractors = same_label + random.sample(
                others, num_choices - len(same_label)
            )

        final_choices = [{"answer": correct_text, "correct": True}]
        final_choices.extend(
            {"answer": text, "correct": False} for text in distractors
        )
        random.shuffle(final_choices)
        return final_choices

    def _generate_question(self, qg_input):
        """Run the model on one input string and return the decoded question."""
        self.qg_model.eval()
        encoded_input = self._encode_qg_input(qg_input)
        with torch.no_grad():
            # Inputs are padded to max_length, so hand over the attention mask
            # explicitly to keep padding positions out of the encoding.
            output = self.qg_model.generate(
                input_ids=encoded_input["input_ids"],
                attention_mask=encoded_input["attention_mask"],
            )
        return self.qg_tokenizer.decode(output[0], skip_special_tokens=True)

    def _encode_qg_input(self, qg_input):
        """Tokenize ``qg_input`` to a padded/truncated tensor batch on ``self.device``."""
        return self.qg_tokenizer(
            qg_input,
            padding='max_length',
            max_length=self.SEQ_LENGTH,
            truncation=True,
            return_tensors="pt",
        ).to(self.device)

    def _get_ranked_qa_pairs(
        self, generated_questions, qg_answers, scores, num_questions=10
    ):
        """Return the ``num_questions`` best pairs.

        Args:
            generated_questions: Question strings.
            qg_answers: Answers aligned with ``generated_questions``.
            scores: Indices into the two lists, best first.
            num_questions: Maximum number of pairs to return; clamped to
                ``len(scores)`` with a printed notice.
        """
        if num_questions > len(scores):
            num_questions = len(scores)
            print(
                "\nWas only able to generate {} questions. For more questions, please input a longer text.".format(
                    num_questions
                )
            )

        qa_list = []
        for i in range(num_questions):
            index = scores[i]
            qa_list.append(
                self._make_dict(
                    self._trim_question(generated_questions[index]),
                    qg_answers[index],
                )
            )
        return qa_list

    def _get_all_qa_pairs(self, generated_questions, qg_answers):
        """Return every question/answer pair in generation order."""
        return [
            self._make_dict(
                self._trim_question(generated_questions[i]), qg_answers[i]
            )
            for i in range(len(generated_questions))
        ]

    @staticmethod
    def _trim_question(question):
        """Keep the text up to the first ``?`` and terminate it with ``?``."""
        return question.split("?")[0] + "?"

    def _make_dict(self, question, answer):
        """Bundle a question and its answer into a dict."""
        return {"question": question, "answer": answer}
class QAEvaluator:
    """Rank question/answer pairs with a pretrained sequence classifier.

    Uses ``iarfmoose/bert-base-cased-qa-evaluator``; the value in column 1 of
    the model's first output is used as the ranking score for a pair.
    """

    def __init__(self, model_dir=None):
        """Load the evaluator tokenizer and model onto the available device.

        Args:
            model_dir: Accepted for interface compatibility; currently unused.
        """
        QAE_PRETRAINED = "iarfmoose/bert-base-cased-qa-evaluator"
        self.SEQ_LENGTH = 512
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.qae_tokenizer = AutoTokenizer.from_pretrained(QAE_PRETRAINED)
        self.qae_model = AutoModelForSequenceClassification.from_pretrained(
            QAE_PRETRAINED
        )
        self.qae_model.to(self.device)

    def encode_qa_pairs(self, questions, answers):
        """Encode aligned questions and answers and move them to ``self.device``.

        Returns:
            One encoded input per question/answer pair.
        """
        return [
            self._encode_qa(question, answer).to(self.device)
            for question, answer in zip(questions, answers)
        ]

    def get_scores(self, encoded_qa_pairs):
        """Score every encoded pair and return their indices, best first.

        Returns:
            Indices into ``encoded_qa_pairs`` sorted by descending score; ties
            keep their original relative order.
        """
        self.qae_model.eval()
        with torch.no_grad():
            scores = [self._evaluate_qa(pair) for pair in encoded_qa_pairs]
        return sorted(range(len(scores)), key=scores.__getitem__, reverse=True)

    def _encode_qa(self, question, answer):
        """Tokenize a question together with its (correct) answer text.

        Args:
            question: Question string.
            answer: Either the answer string, or a list of
                ``{"answer": str, "correct": bool}`` choice dicts, in which
                case the choice marked correct is used (the last one if
                several are marked).

        Raises:
            ValueError: If ``answer`` is a list without a correct choice.
        """
        if isinstance(answer, list):
            correct_answer = None
            for choice in answer:
                if choice["correct"]:
                    correct_answer = choice["answer"]
            if correct_answer is None:
                # Fail with a clear message rather than an unbound-variable error.
                raise ValueError(
                    "Multiple-choice answer list has no choice marked correct"
                )
        else:
            correct_answer = answer

        return self.qae_tokenizer(
            text=question,
            text_pair=correct_answer,
            padding="max_length",
            max_length=self.SEQ_LENGTH,
            truncation=True,
            return_tensors="pt",
        )

    def _evaluate_qa(self, encoded_qa_pair):
        """Return the ranking score of one encoded pair as a Python float."""
        output = self.qae_model(**encoded_qa_pair)
        # First output, first (only) row of the batch, column 1.
        # .item() yields a plain number so sorting never compares tensors.
        return output[0][0][1].item()
def print_qa(qa_list, show_answers=True):
    """Pretty-print generated questions and, optionally, their answers.

    Args:
        qa_list: List of ``{"question": str, "answer": ...}`` dicts, where the
            answer is either a string (sentence answer) or a list of
            ``{"answer": str, "correct": bool}`` choice dicts.
        show_answers: For multiple-choice answers, controls whether the
            correct choice is marked with ``(correct)``; the choices
            themselves are always listed. For sentence answers, controls
            whether the answer is printed at all.
    """
    for number, qa in enumerate(qa_list, start=1):
        # "N) " is 3 characters for one-digit numbers and 4 for two-digit
        # ones; indenting by that much lines "A:" up under "Q:".
        space = " " * (3 if number < 10 else 4)
        print("{}) Q: {}".format(number, qa["question"]))
        answer = qa["answer"]

        if isinstance(answer, list):
            for j, choice in enumerate(answer, start=1):
                if j == 1:
                    prefix = "{}A: 1.".format(space)
                else:
                    # Three extra spaces match the width of "A: " so the
                    # choice numbers line up under "1.".
                    prefix = "{}{}.".format(space + "   ", j)
                if show_answers:
                    marker = "(correct)" if choice["correct"] else ""
                    print(prefix, choice["answer"], marker)
                else:
                    print(prefix, choice["answer"])
            print("")
        elif show_answers:
            print("{}A:".format(space), answer, "\n")