QuestMasterAi

Sleeping

App Files Files Community

QuestMasterAi / questiongenerator.py

DpShirazi

Upload 3 files

8c07c55 almost 2 years ago

raw

history blame contribute delete

15.7 kB

	import en_core_web_sm
	import json
	import numpy as np
	import random
	import re
	import torch
	from transformers import (
	AutoTokenizer,
	AutoModelForSeq2SeqLM,
	AutoModelForSequenceClassification,
	)
	from typing import Any, List, Mapping, Tuple


	class QuestionGenerator:
	"""A transformer-based NLP system for generating reading comprehension-style questions from
	texts. It can generate full sentence questions, multiple choice questions, or a mix of the
	two styles.

	To filter out low quality questions, questions are assigned a score and ranked once they have
	been generated. Only the top k questions will be returned. This behaviour can be turned off
	by setting use_evaluator=False.
	"""

	def __init__(self) -> None:

	QG_PRETRAINED = "iarfmoose/t5-base-question-generator"
	self.ANSWER_TOKEN = "<answer>"
	self.CONTEXT_TOKEN = "<context>"
	self.SEQ_LENGTH = 512

	self.device = torch.device(
	"cuda" if torch.cuda.is_available() else "cpu")

	self.qg_tokenizer = AutoTokenizer.from_pretrained(
	QG_PRETRAINED, use_fast=False)
	self.qg_model = AutoModelForSeq2SeqLM.from_pretrained(QG_PRETRAINED)
	self.qg_model.to(self.device)
	self.qg_model.eval()

	self.qa_evaluator = QAEvaluator()

	def generate(
	self,
	article: str,
	use_evaluator: bool = True,
	num_questions: bool = None,
	answer_style: str = "all"
	) -> List:
	"""Takes an article and generates a set of question and answer pairs. If use_evaluator
	is True then QA pairs will be ranked and filtered based on their quality. answer_style
	should selected from ["all", "sentences", "multiple_choice"].
	"""

	print("Generating questions...\n")

	qg_inputs, qg_answers = self.generate_qg_inputs(article, answer_style)
	generated_questions = self.generate_questions_from_inputs(qg_inputs)

	message = "{} questions doesn't match {} answers".format(
	len(generated_questions), len(qg_answers)
	)
	assert len(generated_questions) == len(qg_answers), message

	if use_evaluator:
	print("Evaluating QA pairs...\n")
	encoded_qa_pairs = self.qa_evaluator.encode_qa_pairs(
	generated_questions, qg_answers
	)
	scores = self.qa_evaluator.get_scores(encoded_qa_pairs)

	if num_questions:
	qa_list = self._get_ranked_qa_pairs(
	generated_questions, qg_answers, scores, num_questions
	)
	else:
	qa_list = self._get_ranked_qa_pairs(
	generated_questions, qg_answers, scores
	)

	else:
	print("Skipping evaluation step.\n")
	qa_list = self._get_all_qa_pairs(generated_questions, qg_answers)

	return qa_list

	def generate_qg_inputs(self, text: str, answer_style: str) -> Tuple[List[str], List[str]]:
	"""Given a text, returns a list of model inputs and a list of corresponding answers.
	Model inputs take the form "answer_token <answer text> context_token <context text>" where
	the answer is a string extracted from the text, and the context is the wider text surrounding
	the context.
	"""

	VALID_ANSWER_STYLES = ["all", "sentences", "multiple_choice"]

	if answer_style not in VALID_ANSWER_STYLES:
	raise ValueError(
	"Invalid answer style {}. Please choose from {}".format(
	answer_style, VALID_ANSWER_STYLES
	)
	)

	inputs = []
	answers = []

	if answer_style == "sentences" or answer_style == "all":
	segments = self._split_into_segments(text)

	for segment in segments:
	sentences = self._split_text(segment)
	prepped_inputs, prepped_answers = self._prepare_qg_inputs(
	sentences, segment
	)
	inputs.extend(prepped_inputs)
	answers.extend(prepped_answers)

	if answer_style == "multiple_choice" or answer_style == "all":
	sentences = self._split_text(text)
	prepped_inputs, prepped_answers = self._prepare_qg_inputs_MC(
	sentences
	)
	inputs.extend(prepped_inputs)
	answers.extend(prepped_answers)

	return inputs, answers

	def generate_questions_from_inputs(self, qg_inputs: List) -> List[str]:
	"""Given a list of concatenated answers and contexts, with the form:
	"answer_token <answer text> context_token <context text>", generates a list of
	questions.
	"""
	generated_questions = []

	for qg_input in qg_inputs:
	question = self._generate_question(qg_input)
	generated_questions.append(question)

	return generated_questions

	def _split_text(self, text: str) -> List[str]:
	"""Splits the text into sentences, and attempts to split or truncate long sentences."""
	MAX_SENTENCE_LEN = 128
	sentences = re.findall(".*?[.!\?]", text)
	cut_sentences = []

	for sentence in sentences:
	if len(sentence) > MAX_SENTENCE_LEN:
	cut_sentences.extend(re.split("[,;:)]", sentence))

	# remove useless post-quote sentence fragments
	cut_sentences = [s for s in sentences if len(s.split(" ")) > 5]
	sentences = sentences + cut_sentences

	return list(set([s.strip(" ") for s in sentences]))

	def _split_into_segments(self, text: str) -> List[str]:
	"""Splits a long text into segments short enough to be input into the transformer network.
	Segments are used as context for question generation.
	"""
	MAX_TOKENS = 490
	paragraphs = text.split("\n")
	tokenized_paragraphs = [
	self.qg_tokenizer(p)["input_ids"] for p in paragraphs if len(p) > 0
	]
	segments = []

	while len(tokenized_paragraphs) > 0:
	segment = []

	while len(segment) < MAX_TOKENS and len(tokenized_paragraphs) > 0:
	paragraph = tokenized_paragraphs.pop(0)
	segment.extend(paragraph)
	segments.append(segment)

	return [self.qg_tokenizer.decode(s, skip_special_tokens=True) for s in segments]

	def _prepare_qg_inputs(
	self,
	sentences: List[str],
	text: str
	) -> Tuple[List[str], List[str]]:
	"""Uses sentences as answers and the text as context. Returns a tuple of (model inputs, answers).
	Model inputs are "answer_token <answer text> context_token <context text>"
	"""
	inputs = []
	answers = []

	for sentence in sentences:
	qg_input = f"{self.ANSWER_TOKEN} {sentence} {self.CONTEXT_TOKEN} {text}"
	inputs.append(qg_input)
	answers.append(sentence)

	return inputs, answers

	def _prepare_qg_inputs_MC(self, sentences: List[str]) -> Tuple[List[str], List[str]]:
	"""Performs NER on the text, and uses extracted entities are candidate answers for multiple-choice
	questions. Sentences are used as context, and entities as answers. Returns a tuple of (model inputs, answers).
	Model inputs are "answer_token <answer text> context_token <context text>"
	"""
	spacy_nlp = en_core_web_sm.load()
	docs = list(spacy_nlp.pipe(sentences, disable=["parser"]))
	inputs_from_text = []
	answers_from_text = []

	for doc, sentence in zip(docs, sentences):
	entities = doc.ents
	if entities:

	for entity in entities:
	qg_input = f"{self.ANSWER_TOKEN} {entity} {self.CONTEXT_TOKEN} {sentence}"
	answers = self._get_MC_answers(entity, docs)
	inputs_from_text.append(qg_input)
	answers_from_text.append(answers)

	return inputs_from_text, answers_from_text

	def _get_MC_answers(self, correct_answer: Any, docs: Any) -> List[Mapping[str, Any]]:
	"""Finds a set of alternative answers for a multiple-choice question. Will attempt to find
	alternatives of the same entity type as correct_answer if possible.
	"""
	entities = []

	for doc in docs:
	entities.extend([{"text": e.text, "label_": e.label_}
	for e in doc.ents])

	# remove duplicate elements
	entities_json = [json.dumps(kv) for kv in entities]
	pool = set(entities_json)
	num_choices = (
	min(4, len(pool)) - 1
	) # -1 because we already have the correct answer

	# add the correct answer
	final_choices = []
	correct_label = correct_answer.label_
	final_choices.append({"answer": correct_answer.text, "correct": True})
	pool.remove(
	json.dumps({"text": correct_answer.text,
	"label_": correct_answer.label_})
	)

	# find answers with the same NER label
	matches = [e for e in pool if correct_label in e]

	# if we don't have enough then add some other random answers
	if len(matches) < num_choices:
	choices = matches
	pool = pool.difference(set(choices))
	choices.extend(random.sample(pool, num_choices - len(choices)))
	else:
	choices = random.sample(matches, num_choices)

	choices = [json.loads(s) for s in choices]

	for choice in choices:
	final_choices.append({"answer": choice["text"], "correct": False})

	random.shuffle(final_choices)
	return final_choices

	@torch.no_grad()
	def _generate_question(self, qg_input: str) -> str:
	"""Takes qg_input which is the concatenated answer and context, and uses it to generate
	a question sentence. The generated question is decoded and then returned.
	"""
	encoded_input = self._encode_qg_input(qg_input)
	output = self.qg_model.generate(input_ids=encoded_input["input_ids"])
	question = self.qg_tokenizer.decode(
	output[0],
	skip_special_tokens=True
	)
	return question

	def _encode_qg_input(self, qg_input: str) -> torch.tensor:
	"""Tokenizes a string and returns a tensor of input ids corresponding to indices of tokens in
	the vocab.
	"""
	return self.qg_tokenizer(
	qg_input,
	padding='max_length',
	max_length=self.SEQ_LENGTH,
	truncation=True,
	return_tensors="pt",
	).to(self.device)

	def _get_ranked_qa_pairs(
	self, generated_questions: List[str], qg_answers: List[str], scores, num_questions: int = 10
	) -> List[Mapping[str, str]]:
	"""Ranks generated questions according to scores, and returns the top num_questions examples.
	"""
	if num_questions > len(scores):
	num_questions = len(scores)
	print((
	f"\nWas only able to generate {num_questions} questions.",
	"For more questions, please input a longer text.")
	)

	qa_list = []

	for i in range(num_questions):
	index = scores[i]
	qa = {
	"question": generated_questions[index].split("?")[0] + "?",
	"answer": qg_answers[index]
	}
	qa_list.append(qa)

	return qa_list

	def _get_all_qa_pairs(self, generated_questions: List[str], qg_answers: List[str]):
	"""Formats question and answer pairs without ranking or filtering."""
	qa_list = []

	for question, answer in zip(generated_questions, qg_answers):
	qa = {
	"question": question.split("?")[0] + "?",
	"answer": answer
	}
	qa_list.append(qa)

	return qa_list


	class QAEvaluator:
	"""Wrapper for a transformer model which evaluates the quality of question-answer pairs.
	Given a QA pair, the model will generate a score. Scores can be used to rank and filter
	QA pairs.
	"""

	def __init__(self) -> None:

	QAE_PRETRAINED = "iarfmoose/bert-base-cased-qa-evaluator"
	self.SEQ_LENGTH = 512

	self.device = torch.device(
	"cuda" if torch.cuda.is_available() else "cpu")

	self.qae_tokenizer = AutoTokenizer.from_pretrained(QAE_PRETRAINED)
	self.qae_model = AutoModelForSequenceClassification.from_pretrained(
	QAE_PRETRAINED
	)
	self.qae_model.to(self.device)
	self.qae_model.eval()

	def encode_qa_pairs(self, questions: List[str], answers: List[str]) -> List[torch.tensor]:
	"""Takes a list of questions and a list of answers and encodes them as a list of tensors."""
	encoded_pairs = []

	for question, answer in zip(questions, answers):
	encoded_qa = self._encode_qa(question, answer)
	encoded_pairs.append(encoded_qa.to(self.device))

	return encoded_pairs

	def get_scores(self, encoded_qa_pairs: List[torch.tensor]) -> List[float]:
	"""Generates scores for a list of encoded QA pairs."""
	scores = {}

	for i in range(len(encoded_qa_pairs)):
	scores[i] = self._evaluate_qa(encoded_qa_pairs[i])

	return [
	k for k, v in sorted(scores.items(), key=lambda item: item[1], reverse=True)
	]

	def _encode_qa(self, question: str, answer: str) -> torch.tensor:
	"""Concatenates a question and answer, and then tokenizes them. Returns a tensor of
	input ids corresponding to indices in the vocab.
	"""
	if type(answer) is list:
	for a in answer:
	if a["correct"]:
	correct_answer = a["answer"]
	else:
	correct_answer = answer

	return self.qae_tokenizer(
	text=question,
	text_pair=correct_answer,
	padding="max_length",
	max_length=self.SEQ_LENGTH,
	truncation=True,
	return_tensors="pt",
	)

	@torch.no_grad()
	def _evaluate_qa(self, encoded_qa_pair: torch.tensor) -> float:
	"""Takes an encoded QA pair and returns a score."""
	output = self.qae_model(**encoded_qa_pair)
	return output[0][0][1]


	def print_qa(qa_list: List[Mapping[str, str]], show_answers: bool = True) -> None:
	"""Formats and prints a list of generated questions and answers."""

	for i in range(len(qa_list)):
	# wider space for 2 digit q nums
	space = " " * int(np.where(i < 9, 3, 4))

	print(f"{i + 1}) Q: {qa_list[i]['question']}")

	answer = qa_list[i]["answer"]

	# print a list of multiple choice answers
	if type(answer) is list:

	if show_answers:
	print(
	f"{space}A: 1. {answer[0]['answer']} "
	f"{np.where(answer[0]['correct'], '(correct)', '')}"
	)
	for j in range(1, len(answer)):
	print(
	f"{space + ' '}{j + 1}. {answer[j]['answer']} "
	f"{np.where(answer[j]['correct']==True,'(correct)', '')}"
	)

	else:
	print(f"{space}A: 1. {answer[0]['answer']}")
	for j in range(1, len(answer)):
	print(f"{space + ' '}{j + 1}. {answer[j]['answer']}")

	print("")

	# print full sentence answers
	else:
	if show_answers:
	print(f"{space}A: {answer}\n")