Spaces:

pragnakalp
/

Question_Generation_T5

Running

App Files Files Community

Question_Generation_T5 / questiongenerator.py

pragnakalp

Upload questiongenerator.py

5c4ebbb over 2 years ago

raw

history blame

12.6 kB

	import os
	import sys
	import math
	import numpy as np
	import torch
	import spacy
	import re
	import random
	import json
	import en_core_web_sm
	from string import punctuation

	#from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
	#from transformers import BertTokenizer, BertForSequenceClassification
	from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
	class QuestionGenerator():
	    """Generate questions from a text with a pretrained seq2seq model.

	    Candidate answers are extracted from the text (whole sentences and/or
	    named entities found by spaCy), each one is combined with its context
	    into a model input, and the 'iarfmoose/t5-base-question-generator'
	    model produces one question per input.
	    """

	    def __init__(self, model_dir=None):
	        """Load the question-generation model, its tokenizer and a QAEvaluator.

	        Args:
	            model_dir: Forwarded unchanged to ``QAEvaluator``.
	        """
	        QG_PRETRAINED = 'iarfmoose/t5-base-question-generator'
	        self.ANSWER_TOKEN = '<answer>'
	        self.CONTEXT_TOKEN = '<context>'
	        self.SEQ_LENGTH = 512

	        # Inference is pinned to the CPU. To use a GPU when present, switch to:
	        # torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	        self.device = torch.device('cpu')

	        self.qg_tokenizer = AutoTokenizer.from_pretrained(QG_PRETRAINED)
	        self.qg_model = AutoModelForSeq2SeqLM.from_pretrained(QG_PRETRAINED)
	        self.qg_model.to(self.device)

	        self.qa_evaluator = QAEvaluator(model_dir)

	    def generate(self, article, use_evaluator=True, num_questions=None, answer_style='all'):
	        """Return the list of unique questions generated from ``article``.

	        Args:
	            article: Source text.
	            use_evaluator: Accepted for backward compatibility only. The
	                evaluation/ranking step is not executed by this method, so
	                the flag has no effect.
	            num_questions: Maximum number of questions to return; ``None``
	                means no limit.
	            answer_style: One of 'all', 'sentences' or 'multiple_choice'.

	        Returns:
	            A list of question strings (no answers attached).

	        Raises:
	            ValueError: If ``answer_style`` is not a supported value.
	        """
	        print("Generating questions...\n")

	        qg_inputs, qg_answers = self.generate_qg_inputs(article, answer_style)
	        print("qg_inputs, qg_answers=>",qg_inputs, qg_answers)
	        generated_questions = self.generate_questions_from_inputs(qg_inputs,num_questions)
	        print("generated_questions(generate)=>",generated_questions)
	        # The questions are de-duplicated and capped, so they no longer line up
	        # one-to-one with qg_answers. The former ranking code that followed this
	        # return was unreachable (and would have failed its own length check),
	        # so it has been dropped; callers receive the questions only.
	        return generated_questions

	    def generate_qg_inputs(self, text, answer_style):
	        """Build model inputs and their matching answers for ``text``.

	        Args:
	            text: Source text.
	            answer_style: 'sentences' uses whole sentences as answers,
	                'multiple_choice' uses named entities, 'all' uses both.

	        Returns:
	            A ``(inputs, answers)`` tuple of equally long lists.

	        Raises:
	            ValueError: If ``answer_style`` is not a supported value.
	        """
	        VALID_ANSWER_STYLES = ['all', 'sentences', 'multiple_choice']

	        if answer_style not in VALID_ANSWER_STYLES:
	            raise ValueError(
	                "Invalid answer style {}. Please choose from {}".format(
	                    answer_style,
	                    VALID_ANSWER_STYLES
	                )
	            )

	        inputs = []
	        answers = []

	        if answer_style in ('sentences', 'all'):
	            # Long texts are cut into segments first; each sentence is paired
	            # with the segment it came from as context.
	            for segment in self._split_into_segments(text):
	                sentences = self._split_text(segment)
	                prepped_inputs, prepped_answers = self._prepare_qg_inputs(sentences, segment)
	                inputs.extend(prepped_inputs)
	                answers.extend(prepped_answers)

	        if answer_style in ('multiple_choice', 'all'):
	            sentences = self._split_text(text)
	            prepped_inputs, prepped_answers = self._prepare_qg_inputs_MC(sentences)
	            inputs.extend(prepped_inputs)
	            answers.extend(prepped_answers)

	        return inputs, answers

	    def generate_questions_from_inputs(self, qg_inputs, num_questions=None):
	        """Generate unique questions from prepared inputs, up to a limit.

	        Args:
	            qg_inputs: Iterable of model input strings.
	            num_questions: Maximum number of questions (anything ``int()``
	                accepts). ``None`` means every input is processed.

	        Returns:
	            A list of distinct question strings, each ending in one '?'.
	        """
	        # int(None) used to raise TypeError when no limit was supplied.
	        limit = None if num_questions is None else int(num_questions)
	        generated_questions = []
	        print("num que => ", num_questions)
	        for qg_input in qg_inputs:
	            if limit is not None and len(generated_questions) >= limit:
	                break
	            question = self._generate_question(qg_input)
	            # Drop surrounding whitespace and punctuation (including any run
	            # of trailing question marks), then terminate with exactly one '?'.
	            question = question.strip().strip(punctuation).strip() + "?"
	            if question not in generated_questions:
	                generated_questions.append(question)
	                print("question ===> ",question)
	        return generated_questions

	    def _split_text(self, text):
	        """Split ``text`` into unique candidate answer strings.

	        The text is cut after every '.', '!' or '?'. Sentences longer than
	        ``MAX_SENTENCE_LEN`` characters are additionally cut at ',', ';', ':'
	        and ')'; resulting fragments with more than five words are kept as
	        extra candidates. Text after the last terminator is ignored.

	        Returns:
	            A list of distinct strings in order of first appearance.
	        """
	        MAX_SENTENCE_LEN = 128

	        sentences = re.findall(r'.*?[.!?]', text)

	        cut_sentences = []
	        for sentence in sentences:
	            if len(sentence) > MAX_SENTENCE_LEN:
	                cut_sentences.extend(re.split(r'[,;:)]', sentence))
	        # temporary solution to remove useless post-quote sentence fragments
	        # (the filter previously iterated over `sentences`, which threw away
	        # every fragment produced above)
	        cut_sentences = [s for s in cut_sentences if len(s.split()) > 5]

	        # dict.fromkeys de-duplicates while keeping a stable order; a set
	        # would reorder the candidates from run to run.
	        stripped = (s.strip(" ") for s in sentences + cut_sentences)
	        return list(dict.fromkeys(stripped))

	    def _split_into_segments(self, text):
	        """Group the paragraphs of ``text`` into segments of bounded size.

	        Paragraphs (split on newlines, empty ones skipped) are tokenized and
	        appended to the current segment until it holds at least
	        ``MAX_TOKENS`` token ids; a paragraph is never split, so a segment
	        may exceed the limit.

	        Returns:
	            The decoded text of each segment.
	        """
	        MAX_TOKENS = 490

	        tokenized_paragraphs = [
	            self.qg_tokenizer(p)['input_ids'] for p in text.split('\n') if len(p) > 0
	        ]

	        segments = []
	        segment = []
	        for paragraph in tokenized_paragraphs:
	            segment.extend(paragraph)
	            if len(segment) >= MAX_TOKENS:
	                segments.append(segment)
	                segment = []
	        if segment:
	            segments.append(segment)

	        # Skip special tokens so that markers the tokenizer adds to each
	        # paragraph do not leak into the context and the candidate answers.
	        return [self.qg_tokenizer.decode(s, skip_special_tokens=True) for s in segments]

	    def _prepare_qg_inputs(self, sentences, text):
	        """Build one model input per sentence, using ``text`` as context.

	        Returns:
	            ``(inputs, answers)`` where each answer is the sentence itself.
	        """
	        inputs = [
	            '{} {} {} {}'.format(self.ANSWER_TOKEN, sentence, self.CONTEXT_TOKEN, text)
	            for sentence in sentences
	        ]
	        return inputs, list(sentences)

	    def _prepare_qg_inputs_MC(self, sentences):
	        """Build one model input per named entity found in ``sentences``.

	        Each entity becomes the answer of an input whose context is the
	        sentence containing it.

	        Returns:
	            ``(inputs, answers)`` where each answer is the list of choices
	            produced by ``_get_MC_answers``.
	        """
	        # Loading the spaCy pipeline is expensive; do it once per instance.
	        spacy_nlp = getattr(self, '_spacy_nlp', None)
	        if spacy_nlp is None:
	            spacy_nlp = self._spacy_nlp = en_core_web_sm.load()

	        docs = list(spacy_nlp.pipe(sentences, disable=['parser']))
	        inputs_from_text = []
	        answers_from_text = []

	        for sentence, doc in zip(sentences, docs):
	            for entity in doc.ents:
	                qg_input = '{} {} {} {}'.format(
	                    self.ANSWER_TOKEN,
	                    entity,
	                    self.CONTEXT_TOKEN,
	                    sentence
	                )
	                inputs_from_text.append(qg_input)
	                answers_from_text.append(self._get_MC_answers(entity, docs))

	        return inputs_from_text, answers_from_text

	    def _get_MC_answers(self, correct_answer, docs):
	        """Build a shuffled multiple-choice list around ``correct_answer``.

	        Up to three distractors are drawn from the entities of ``docs``,
	        preferring entities that carry the same label as the correct answer
	        and falling back to random other entities when there are too few.

	        Args:
	            correct_answer: Object exposing ``text`` and ``label_``.
	            docs: Iterable of objects exposing ``ents``, whose items expose
	                ``text`` and ``label_``.

	        Returns:
	            A list of ``{'answer': str, 'correct': bool}`` dicts containing
	            exactly one correct entry.
	        """
	        MAX_CHOICES = 4

	        correct_text = correct_answer.text
	        correct_label = correct_answer.label_

	        # Unique (text, label) pairs in order of appearance, excluding anything
	        # that reads the same as the correct answer.
	        unique_entities = dict.fromkeys(
	            (e.text, e.label_) for doc in docs for e in doc.ents
	        )
	        candidates = [e for e in unique_entities if e[0] != correct_text]
	        num_choices = min(MAX_CHOICES - 1, len(candidates))

	        # Compare the label field itself; a substring test on the serialized
	        # entity also matched entities whose text merely contained the label.
	        matches = [e for e in candidates if e[1] == correct_label]

	        if len(matches) < num_choices:
	            # if we don't have enough then add some other random answers
	            others = [e for e in candidates if e[1] != correct_label]
	            choices = matches + random.sample(others, num_choices - len(matches))
	        else:
	            # random.sample needs a sequence (sets are rejected on 3.11+).
	            choices = random.sample(matches, num_choices)

	        final_choices = [{'answer': correct_text, 'correct': True}]
	        final_choices.extend({'answer': text, 'correct': False} for text, _ in choices)
	        random.shuffle(final_choices)
	        return final_choices

	    def _generate_question(self, qg_input):
	        """Run the model on one input string and return the decoded question."""
	        self.qg_model.eval()
	        encoded_input = self._encode_qg_input(qg_input)
	        with torch.no_grad():
	            output = self.qg_model.generate(input_ids=encoded_input['input_ids'])
	        # Special tokens are skipped so they cannot end up in the question.
	        return self.qg_tokenizer.decode(output[0], skip_special_tokens=True)

	    def _encode_qg_input(self, qg_input):
	        """Tokenize ``qg_input`` to ``SEQ_LENGTH`` tokens on ``self.device``."""
	        return self.qg_tokenizer(
	            qg_input,
	            padding='max_length',  # replaces the deprecated pad_to_max_length=True
	            max_length=self.SEQ_LENGTH,
	            truncation=True,
	            return_tensors="pt"
	        ).to(self.device)

	    def _get_ranked_qa_pairs(self, generated_questions, qg_answers, scores, num_questions=10):
	        """Return the first ``num_questions`` QA pairs in the order of ``scores``.

	        Args:
	            generated_questions: List of question strings.
	            qg_answers: List of answers, parallel to the questions.
	            scores: Indices into the two lists, in the desired output order.
	            num_questions: Maximum number of pairs; capped at ``len(scores)``.

	        Returns:
	            A list of ``{'question': ..., 'answer': ...}`` dicts.
	        """
	        if num_questions > len(scores):
	            num_questions = len(scores)
	            print("\nWas only able to generate {} questions. For more questions, please input a longer text.".format(num_questions))

	        return [
	            # Keep only the text up to the first '?'.
	            self._make_dict(generated_questions[index].split('?')[0] + '?', qg_answers[index])
	            for index in scores[:max(num_questions, 0)]
	        ]

	    def _get_all_qa_pairs(self, generated_questions, qg_answers):
	        """Pair every question with the answer at the same position."""
	        return [
	            self._make_dict(question.split('?')[0] + '?', qg_answers[i])
	            for i, question in enumerate(generated_questions)
	        ]

	    def _make_dict(self, question, answer):
	        """Wrap a question and its answer in a dict."""
	        return {'question': question, 'answer': answer}


	class QAEvaluator():
	    """Score question/answer pairs with a pretrained sequence classifier.

	    Uses the 'iarfmoose/bert-base-cased-qa-evaluator' model and tokenizer.
	    """

	    def __init__(self, model_dir=None):
	        """Load the evaluator model and tokenizer.

	        Args:
	            model_dir: Accepted for interface compatibility; it is not read,
	                the weights always come from ``QAE_PRETRAINED``.
	        """
	        QAE_PRETRAINED = 'iarfmoose/bert-base-cased-qa-evaluator'
	        self.SEQ_LENGTH = 512

	        # Inference is pinned to the CPU. To use a GPU when present, switch to:
	        # torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	        self.device = torch.device('cpu')

	        self.qae_tokenizer = AutoTokenizer.from_pretrained(QAE_PRETRAINED)
	        self.qae_model = AutoModelForSequenceClassification.from_pretrained(QAE_PRETRAINED)
	        self.qae_model.to(self.device)

	    def encode_qa_pairs(self, questions, answers):
	        """Tokenize each question with the answer at the same position.

	        Returns:
	            A list of encodings moved to ``self.device``, one per question.
	        """
	        return [
	            self._encode_qa(question, answers[i]).to(self.device)
	            for i, question in enumerate(questions)
	        ]

	    def get_scores(self, encoded_qa_pairs):
	        """Rank the encoded pairs by model score.

	        Returns:
	            The indices of ``encoded_qa_pairs`` ordered from highest to
	            lowest score; ties keep their original relative order.
	        """
	        self.qae_model.eval()
	        with torch.no_grad():
	            # Plain floats keep the sort free of tensor comparisons.
	            scores = [float(self._evaluate_qa(pair)) for pair in encoded_qa_pairs]

	        return sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)

	    def _encode_qa(self, question, answer):
	        """Tokenize a question together with its (correct) answer text.

	        Args:
	            question: Question string.
	            answer: Either the answer string, or a list of
	                ``{'answer': str, 'correct': bool}`` choices, in which case
	                the correct choice is used (the last one if several are
	                flagged).

	        Raises:
	            ValueError: If ``answer`` is a list without a correct choice.
	        """
	        if isinstance(answer, list):
	            correct = [a['answer'] for a in answer if a['correct']]
	            if not correct:
	                # Previously this surfaced as an UnboundLocalError.
	                raise ValueError("No correct answer found among the given choices")
	            correct_answer = correct[-1]
	        else:
	            correct_answer = answer

	        return self.qae_tokenizer(
	            text=question,
	            text_pair=correct_answer,
	            padding='max_length',  # replaces the deprecated pad_to_max_length=True
	            max_length=self.SEQ_LENGTH,
	            truncation=True,
	            return_tensors="pt"
	        )

	    def _evaluate_qa(self, encoded_qa_pair):
	        """Run the model on one encoded pair and return its score.

	        The score is element ``[0][0][1]`` of the model output.
	        NOTE(review): presumably the logit of the "pair is valid" class for
	        the single item in the batch - confirm against the model card.
	        """
	        output = self.qae_model(**encoded_qa_pair)
	        return output[0][0][1]


	def print_qa(qa_list, show_answers=True):
	    """Print numbered questions and, optionally, their answers.

	    Args:
	        qa_list: Sequence of dicts with a 'question' entry and an 'answer'
	            entry. The answer is either a plain value or a list of
	            ``{'answer': ..., 'correct': ...}`` choices.
	        show_answers: When true, plain answers are printed and the correct
	            choice of a multiple-choice answer is marked with '(correct)'.
	            When false, plain answers are omitted and choices are listed
	            without the marker.
	    """
	    for number, qa in enumerate(qa_list, start=1):
	        # wider space for 2 digit q nums
	        pad = ' ' * (3 if number < 10 else 4)

	        print('{}) Q: {}'.format(number, qa['question']))
	        answer = qa['answer']

	        # full sentence answers
	        if type(answer) is not list:
	            if show_answers:
	                print('{}A:'.format(pad), answer, '\n')
	            continue

	        # multiple choice answers: the first line carries the "A:" prefix
	        first_choice = answer[0]
	        other_choices = answer[1:]
	        if show_answers:
	            print('{}A: 1.'.format(pad),
	                  first_choice['answer'],
	                  '(correct)' if first_choice['correct'] else '')
	            for position, choice in enumerate(other_choices, start=2):
	                # explicit comparison with True kept on purpose
	                marker = '(correct)' if choice['correct'] == True else ''
	                print('{}{}.'.format(pad + ' ', position), choice['answer'], marker)
	        else:
	            print('{}A: 1.'.format(pad), first_choice['answer'])
	            for position, choice in enumerate(other_choices, start=2):
	                print('{}{}.'.format(pad + ' ', position), choice['answer'])
	        print('')