|
import os |
|
import sys |
|
import math |
|
import numpy as np |
|
import torch |
|
import spacy |
|
import re |
|
import random |
|
import json |
|
import en_core_web_sm |
|
from string import punctuation |
|
|
|
|
|
|
|
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification |
|
class QuestionGenerator():
    """Generate questions from a text with a pretrained seq2seq model.

    Construction loads the tokenizer and model named by ``QG_PRETRAINED``
    onto the CPU device and creates a ``QAEvaluator``.
    """

    def __init__(self, model_dir=None):
        """Load the question-generation tokenizer/model and the QA evaluator.

        Args:
            model_dir: Forwarded unchanged to ``QAEvaluator``; not used
                anywhere else in this constructor.
        """
        QG_PRETRAINED = 'iarfmoose/t5-base-question-generator'
        self.ANSWER_TOKEN = '<answer>'
        self.CONTEXT_TOKEN = '<context>'
        self.SEQ_LENGTH = 512

        self.device = torch.device('cpu')

        self.qg_tokenizer = AutoTokenizer.from_pretrained(QG_PRETRAINED)
        self.qg_model = AutoModelForSeq2SeqLM.from_pretrained(QG_PRETRAINED)
        self.qg_model.to(self.device)

        self.qa_evaluator = QAEvaluator(model_dir)

    def generate(self, article, use_evaluator=True, num_questions=None, answer_style='all'):
        """Generate question strings for ``article``.

        Args:
            article: Source text to generate questions from.
            use_evaluator: Accepted for call compatibility. This method
                returns right after question generation, so the QA evaluator
                is never invoked and this flag has no effect.
            num_questions: Maximum number of unique questions to produce;
                ``None`` means no limit.
            answer_style: One of ``'all'``, ``'sentences'`` or
                ``'multiple_choice'``.

        Returns:
            list[str]: The generated questions (not paired with answers).

        Raises:
            ValueError: If ``answer_style`` is not a valid style.
        """
        print("Generating questions...\n")

        qg_inputs, qg_answers = self.generate_qg_inputs(article, answer_style)
        print("qg_inputs, qg_answers=>",qg_inputs, qg_answers)
        generated_questions = self.generate_questions_from_inputs(qg_inputs,num_questions)
        print("generated_questions(generate)=>",generated_questions)
        # The ranking/pairing code that used to follow this return was
        # unreachable and has been removed; the return value is unchanged.
        return generated_questions

    def generate_qg_inputs(self, text, answer_style):
        """Build model inputs and their answers for the given answer style.

        Args:
            text: Source text.
            answer_style: ``'sentences'``, ``'multiple_choice'`` or ``'all'``
                (both kinds, sentence inputs first).

        Returns:
            tuple[list, list]: ``(inputs, answers)``, index-aligned.

        Raises:
            ValueError: If ``answer_style`` is not one of the valid styles.
        """
        VALID_ANSWER_STYLES = ['all', 'sentences', 'multiple_choice']

        if answer_style not in VALID_ANSWER_STYLES:
            raise ValueError(
                "Invalid answer style {}. Please choose from {}".format(
                    answer_style,
                    VALID_ANSWER_STYLES
                )
            )

        inputs = []
        answers = []

        if answer_style in ('sentences', 'all'):
            # Each segment serves as the context for its own sentences.
            for segment in self._split_into_segments(text):
                sentences = self._split_text(segment)
                prepped_inputs, prepped_answers = self._prepare_qg_inputs(sentences, segment)
                inputs.extend(prepped_inputs)
                answers.extend(prepped_answers)

        if answer_style in ('multiple_choice', 'all'):
            sentences = self._split_text(text)
            prepped_inputs, prepped_answers = self._prepare_qg_inputs_MC(sentences)
            inputs.extend(prepped_inputs)
            answers.extend(prepped_answers)

        return inputs, answers

    def generate_questions_from_inputs(self, qg_inputs, num_questions=None):
        """Generate unique questions from prepared inputs, up to a limit.

        Each generated string is stripped of surrounding whitespace and
        punctuation and then terminated with a single ``?``. Duplicates are
        skipped and do not count towards the limit.

        Args:
            qg_inputs: Iterable of model input strings.
            num_questions: Maximum number of unique questions, anything
                accepted by ``int()``. ``None`` means no limit (previously
                this raised ``TypeError``).

        Returns:
            list[str]: Unique questions in generation order.
        """
        limit = None if num_questions is None else int(num_questions)
        print("num que => ", num_questions)

        generated_questions = []
        for qg_input in qg_inputs:
            if limit is not None and len(generated_questions) >= limit:
                break
            question = self._generate_question(qg_input)
            question = question.strip().strip(punctuation) + "?"
            if question not in generated_questions:
                generated_questions.append(question)
                print("question ===> ",question)
        return generated_questions

    def _split_text(self, text):
        """Split ``text`` into sentences plus fragments of long sentences.

        Sentences are the shortest runs ending in ``.``, ``!`` or ``?``;
        trailing text without such a terminator is not returned. Sentences
        longer than ``MAX_SENTENCE_LEN`` characters are additionally split on
        ``, ; : )`` and the fragments with more than five space-separated
        pieces are kept.

        Returns:
            list[str]: Unique space-stripped strings in first-seen order.
        """
        MAX_SENTENCE_LEN = 128

        sentences = re.findall(r'.*?[.!?]', text)

        cut_sentences = []
        for sentence in sentences:
            if len(sentence) > MAX_SENTENCE_LEN:
                cut_sentences.extend(re.split('[,;:)]', sentence))

        # Filter the fragments themselves (the original filtered ``sentences``
        # here, which silently discarded every fragment computed above).
        cut_sentences = [s for s in cut_sentences if len(s.split(" ")) > 5]
        sentences = sentences + cut_sentences

        # dict.fromkeys de-duplicates while keeping a deterministic order.
        return list(dict.fromkeys(s.strip(" ") for s in sentences))

    def _split_into_segments(self, text):
        """Group the paragraphs of ``text`` into token-bounded segments.

        Non-empty lines (split on ``'\\n'``) are tokenized with
        ``self.qg_tokenizer``; whole paragraphs are appended to a segment
        until it holds at least ``MAX_TOKENS`` token ids, so a segment may
        exceed that count by part of its last paragraph.

        Returns:
            list[str]: Each segment decoded back to text by the tokenizer.
        """
        MAX_TOKENS = 490

        paragraphs = text.split('\n')
        tokenized_paragraphs = [self.qg_tokenizer(p)['input_ids'] for p in paragraphs if len(p) > 0]

        segments = []
        position = 0
        total = len(tokenized_paragraphs)
        while position < total:
            segment = []
            while len(segment) < MAX_TOKENS and position < total:
                segment.extend(tokenized_paragraphs[position])
                position += 1
            segments.append(segment)
        return [self.qg_tokenizer.decode(s) for s in segments]

    def _prepare_qg_inputs(self, sentences, text):
        """Format one model input per sentence, using ``text`` as context.

        Returns:
            tuple[list[str], list[str]]: ``(inputs, answers)`` where each
            input is ``"<answer> sentence <context> text"`` (using the
            instance's token strings) and each answer is the sentence.
        """
        inputs = [
            '{} {} {} {}'.format(self.ANSWER_TOKEN, sentence, self.CONTEXT_TOKEN, text)
            for sentence in sentences
        ]
        answers = list(sentences)
        return inputs, answers

    def _prepare_qg_inputs_MC(self, sentences):
        """Build multiple-choice inputs from named entities in ``sentences``.

        Every entity found by the spaCy pipeline becomes a correct answer
        whose context is the sentence it occurs in; the other options come
        from ``_get_MC_answers``.

        Returns:
            tuple[list[str], list[list[dict]]]: ``(inputs, answers)``.
        """
        # Loading the spaCy model is expensive, so keep it on the instance.
        spacy_nlp = getattr(self, '_spacy_nlp', None)
        if spacy_nlp is None:
            spacy_nlp = self._spacy_nlp = en_core_web_sm.load()

        docs = list(spacy_nlp.pipe(sentences, disable=['parser']))
        inputs_from_text = []
        answers_from_text = []

        for sentence, doc in zip(sentences, docs):
            for entity in doc.ents:
                qg_input = '{} {} {} {}'.format(
                    self.ANSWER_TOKEN,
                    entity,
                    self.CONTEXT_TOKEN,
                    sentence
                )
                answers = self._get_MC_answers(entity, docs)
                inputs_from_text.append(qg_input)
                answers_from_text.append(answers)

        return inputs_from_text, answers_from_text

    def _get_MC_answers(self, correct_answer, docs):
        """Pick up to three distractors for ``correct_answer`` and shuffle.

        Distractors are drawn from the unique ``(text, label_)`` entities of
        all ``docs``, preferring entities with the same label as the correct
        answer and topping up with other entities when there are too few.

        Args:
            correct_answer: Object exposing ``.text`` and ``.label_``.
            docs: Iterable of objects exposing ``.ents``.

        Returns:
            list[dict]: Shuffled ``{'answer': str, 'correct': bool}`` dicts,
            exactly one of which is flagged correct.
        """
        # Unique entities in first-seen order; a list (not a set) because
        # random.sample no longer accepts sets on Python 3.11+.
        pool = list(dict.fromkeys(
            (e.text, e.label_) for doc in docs for e in doc.ents
        ))
        num_choices = max(0, min(4, len(pool)) - 1)

        correct_label = correct_answer.label_
        correct_key = (correct_answer.text, correct_label)
        candidates = [entry for entry in pool if entry != correct_key]

        # Compare labels exactly rather than by substring of a JSON dump.
        matches = [entry for entry in candidates if entry[1] == correct_label]

        if len(matches) < num_choices:
            others = [entry for entry in candidates if entry[1] != correct_label]
            choices = matches + random.sample(others, num_choices - len(matches))
        else:
            choices = random.sample(matches, num_choices)

        final_choices = [{'answer': correct_answer.text, 'correct': True}]
        final_choices.extend(
            {'answer': choice_text, 'correct': False} for choice_text, _ in choices
        )
        random.shuffle(final_choices)
        return final_choices

    def _generate_question(self, qg_input):
        """Run the model on one input string and decode the first output."""
        self.qg_model.eval()
        encoded_input = self._encode_qg_input(qg_input)
        with torch.no_grad():
            output = self.qg_model.generate(input_ids=encoded_input['input_ids'])
        return self.qg_tokenizer.decode(output[0])

    def _encode_qg_input(self, qg_input):
        """Tokenize ``qg_input`` padded/truncated to ``SEQ_LENGTH`` tensors."""
        return self.qg_tokenizer(
            qg_input,
            # Replaces the deprecated ``pad_to_max_length=True``.
            padding='max_length',
            max_length=self.SEQ_LENGTH,
            truncation=True,
            return_tensors="pt"
        ).to(self.device)

    def _get_ranked_qa_pairs(self, generated_questions, qg_answers, scores, num_questions=10):
        """Return the first ``num_questions`` QA pairs in ``scores`` order.

        Args:
            generated_questions: Question strings.
            qg_answers: Answers aligned with ``generated_questions``.
            scores: Indices into the two lists, in the order to emit.
            num_questions: Number of pairs wanted; capped (with a printed
                notice) at ``len(scores)``.

        Returns:
            list[dict]: ``{'question', 'answer'}`` dicts; each question is
            cut after its first ``?``.
        """
        if num_questions > len(scores):
            num_questions = len(scores)
            print("\nWas only able to generate {} questions. For more questions, please input a longer text.".format(num_questions))

        return [
            self._make_dict(
                generated_questions[index].split('?')[0] + '?',
                qg_answers[index])
            for index in scores[:max(0, num_questions)]
        ]

    def _get_all_qa_pairs(self, generated_questions, qg_answers):
        """Pair every question with its answer, in order.

        Returns:
            list[dict]: ``{'question', 'answer'}`` dicts; each question is
            cut after its first ``?``.
        """
        return [
            self._make_dict(question.split('?')[0] + '?', qg_answers[i])
            for i, question in enumerate(generated_questions)
        ]

    def _make_dict(self, question, answer):
        """Return ``{'question': question, 'answer': answer}``."""
        return {'question': question, 'answer': answer}
|
|
|
|
|
class QAEvaluator():
    """Score question/answer pairs with a pretrained sequence classifier.

    Construction loads the tokenizer and model named by ``QAE_PRETRAINED``
    onto the CPU device.
    """

    def __init__(self, model_dir=None):
        """Load the evaluator tokenizer and model.

        Args:
            model_dir: Accepted but not used by this constructor.
        """
        QAE_PRETRAINED = 'iarfmoose/bert-base-cased-qa-evaluator'
        self.SEQ_LENGTH = 512

        self.device = torch.device('cpu')

        self.qae_tokenizer = AutoTokenizer.from_pretrained(QAE_PRETRAINED)
        self.qae_model = AutoModelForSequenceClassification.from_pretrained(QAE_PRETRAINED)
        self.qae_model.to(self.device)

    def encode_qa_pairs(self, questions, answers):
        """Encode each question with the answer at the same index.

        Returns:
            list: One tokenizer encoding per question, moved to
            ``self.device``.
        """
        return [
            self._encode_qa(question, answers[i]).to(self.device)
            for i, question in enumerate(questions)
        ]

    def get_scores(self, encoded_qa_pairs):
        """Rank encoded pairs by evaluator score.

        Returns:
            list[int]: Indices into ``encoded_qa_pairs`` ordered from the
            highest score to the lowest; ties keep their input order.
        """
        self.qae_model.eval()
        with torch.no_grad():
            scores = [self._evaluate_qa(pair) for pair in encoded_qa_pairs]

        return sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)

    def _encode_qa(self, question, answer):
        """Tokenize ``question`` together with its correct answer text.

        Args:
            question: Question string.
            answer: Either the answer itself, or a list of
                ``{'answer': ..., 'correct': ...}`` dicts, in which case the
                last entry flagged correct is used.

        Raises:
            ValueError: If ``answer`` is a list with no entry flagged
                correct (previously this surfaced as ``UnboundLocalError``).
        """
        if isinstance(answer, list):
            flagged = [a['answer'] for a in answer if a['correct']]
            if not flagged:
                raise ValueError("No correct answer found in multiple-choice answers")
            correct_answer = flagged[-1]
        else:
            correct_answer = answer

        return self.qae_tokenizer(
            text=question,
            text_pair=correct_answer,
            # Replaces the deprecated ``pad_to_max_length=True``.
            padding='max_length',
            max_length=self.SEQ_LENGTH,
            truncation=True,
            return_tensors="pt"
        )

    def _evaluate_qa(self, encoded_qa_pair):
        """Run the model on one encoded pair and return ``output[0][0][1]``."""
        output = self.qae_model(**encoded_qa_pair)
        return output[0][0][1]
|
|
|
|
|
def print_qa(qa_list, show_answers=True):
    """Print numbered questions and, optionally, their answers.

    Args:
        qa_list: Sequence of dicts with ``'question'`` and ``'answer'`` keys.
            An answer is either a plain value or a list of
            ``{'answer': ..., 'correct': ...}`` choices.
        show_answers: For list answers, controls whether the correct choice
            is marked with ``(correct)``; the choices themselves are always
            printed. For plain answers, controls whether the answer is
            printed at all.
    """
    for i, qa in enumerate(qa_list):
        # Indent is one wider from the tenth question on (two-digit numbers).
        space = ' ' * (3 if i < 9 else 4)

        print('{}) Q: {}'.format(i + 1, qa['question']))

        answer = qa['answer']

        if isinstance(answer, list):
            for j, choice in enumerate(answer):
                if j == 0:
                    prefix = '{}A: 1.'.format(space)
                else:
                    prefix = '{}{}.'.format(space + ' ', j + 1)

                if show_answers:
                    print(prefix,
                          choice['answer'],
                          '(correct)' if choice['correct'] else '')
                else:
                    print(prefix, choice['answer'])
            print('')
        elif show_answers:
            print('{}A:'.format(space), answer, '\n')