Spaces:

ThePixOne
/

open_domain_qa

Running

App Files Files Community

open_domain_qa / app.py

ThePixOne

Update app.py

fc7d330 verified 7 months ago

raw

history blame

8.47 kB

	"""
	Gradio requires input to be fed in a very peculiar way and does not provide much flexibility, so don't expect too much from this demo. The backbone had to be adjusted to work on Hugging Face Spaces. See https://github.com/PiotrAntoniak/QuestionAnswering for a prettier version that uses Streamlit.
	"""


	import gradio as gr

	# User-facing text for the demo page; both values are passed to gr.Interface
	# (as `description=` and `title=`) when the interface is built.
	description = """Do you have a long document and a bunch of questions that can be answered given the data in this file?
	Fear not for this demo is for you.
	Upload your pdf, ask your questions and wait for the magic to happen.
	DISCLAIMER: I do no have idea what happens to the pdfs that you upload and who has access to them so make sure there is nothing confidential there.
	"""
	# Short heading for the interface.
	title = "QA answering from a pdf."

	from datetime import datetime
	import numpy as np
	import time
	import hashlib
	import torch
	from transformers import AutoTokenizer, AutoModel, AutoModelForQuestionAnswering, pipeline
	from tqdm import tqdm
	import os
	# Run the models on the first CUDA GPU when PyTorch reports one, otherwise on the CPU.
	device = "cuda:0" if torch.cuda.is_available() else "cpu"
	import textract
	from scipy.special import softmax
	import pandas as pd
	from datetime import datetime


	# Retriever: encoder used to embed both the user question and the document passages.
	tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1")
	model = AutoModel.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1").to(device).eval()

	# Reader: extractive question-answering model run on the best-scoring passages.
	tokenizer_ans = AutoTokenizer.from_pretrained("deepset/roberta-large-squad2")
	model_ans = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-large-squad2").to(device).eval()

	# Only pass an explicit device index to the pipeline when a GPU is in use;
	# on CPU the pipeline is left on its default device.
	if device == "cuda:0":
	    pipe = pipeline("question-answering", model_ans, tokenizer=tokenizer_ans, device=0)
	else:
	    pipe = pipeline("question-answering", model_ans, tokenizer=tokenizer_ans)

	def cls_pooling(model_output):
	    """Return the hidden state at sequence position 0 for every batch item.

	    Args:
	        model_output: Object exposing a ``last_hidden_state`` attribute that
	            supports ``[:, 0]`` indexing (assumed to be laid out as
	            batch x sequence x hidden -- TODO confirm against the model used).

	    Returns:
	        The slice ``last_hidden_state[:, 0]``.
	    """
	    return model_output.last_hidden_state[:, 0]

	def encode_query(query):
	    """Embed a question with the retriever model.

	    Args:
	        query: The question text; it is tokenized with truncation enabled.

	    Returns:
	        The pooled embedding (see ``cls_pooling``) moved to the CPU.
	    """
	    encoded_input = tokenizer(query, truncation=True, return_tensors='pt').to(device)

	    # Inference only: no gradients are needed.
	    with torch.no_grad():
	        model_output = model(**encoded_input, return_dict=True)

	    embeddings = cls_pooling(model_output)
	    return embeddings.cpu()


	def _split_into_spans(words, maxlen, stride):
	    """Cut a list of words into overlapping, space-joined text spans.

	    A document shorter than ``maxlen`` words becomes a single span. Otherwise
	    the first span holds the first ``maxlen + stride`` words and every later
	    span holds up to ``stride`` trailing words of the previous window followed
	    by the next ``maxlen`` words.
	    """
	    if len(words) < maxlen:
	        return [" ".join(words)]

	    spans = []
	    num_iters = len(words) // maxlen + 1
	    for i in range(num_iters):
	        if i == 0:
	            chunk = words[:maxlen + stride]
	        else:
	            previous = words[(i - 1) * maxlen:i * maxlen]
	            # ``previous[-0:]`` would be the whole window, so guard stride == 0.
	            overlap = previous[-stride:] if stride else []
	            chunk = overlap + words[i * maxlen:(i + 1) * maxlen]
	        spans.append(" ".join(chunk))
	    return spans


	def encode_docs(docs, maxlen=64, stride=32):
	    """Split a document into overlapping spans, embed them and cache the result.

	    Args:
	        docs: A ``(name, text)`` pair; ``text`` is split on single spaces.
	        maxlen: Number of words per window.
	        stride: Number of words of overlap taken from the previous window.

	    Returns:
	        A tuple ``(embeddings, spans, file_names)``: a float32 NumPy array built
	        by stacking the per-span embeddings and swapping the first two axes,
	        the list of span texts, and a list repeating ``name`` once per span.

	    Side effects:
	        Writes ``emb_<name>.npy``, ``spans_<name>.npy`` and ``file_<name>.npy``
	        into the current working directory.
	    """
	    name, text = docs
	    spans = _split_into_spans(text.split(" "), maxlen, stride)
	    file_names = [name] * len(spans)

	    embeddings = []
	    with torch.no_grad():
	        for span in tqdm(spans):
	            encoded = tokenizer(span, return_tensors='pt', truncation=True).to(device)
	            model_output = model(**encoded, return_dict=True)
	            embeddings.append(cls_pooling(model_output))

	    embeddings = np.float32(torch.stack(embeddings).transpose(0, 1).cpu())

	    # Persist index -> value dictionaries so later questions about the same
	    # document can skip the embedding step.
	    np.save("emb_{}.npy".format(name), dict(enumerate(embeddings)))
	    np.save("spans_{}.npy".format(name), dict(enumerate(spans)))
	    np.save("file_{}.npy".format(name), dict(enumerate(file_names)))

	    return embeddings, spans, file_names

	def _format_answers(df):
	    """Render every row of the answers table as one display string."""
	    return [
	        'PROBABILITIES: ' + row.Probabilities + '\n\n'
	        + 'ANSWER: ' + row.Answer + '\n\n'
	        + 'PASSAGE: ' + row.Passage + '\n\n'
	        for row in df.itertuples(index=False)
	    ]


	def predict(query, data):
	    """Answer ``query`` from the uploaded document ``data``.

	    The document is split into passages and embedded (or its cached embeddings
	    are loaded), the passages are ranked by dot product with the query
	    embedding, and the reader pipeline is run on the most probable ones.

	    Args:
	        query: The question text.
	        data: Uploaded file object; only its ``name`` attribute (a path) is read.

	    Returns:
	        A list of formatted strings, one per ranked passage (at most 20), each
	        containing the probabilities, the answer and the passage.

	    Side effects:
	        Reads and writes ``<sha256>.csv``, ``<name>.txt`` and the ``*.npy``
	        cache files in the working directory, and appends to ``HISTORY.txt``.
	    """
	    print(datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
	    # NOTE(review): the [:-8] presumably drops a random suffix that Gradio
	    # appends to uploaded temp files -- confirm against the Gradio version used.
	    name_to_save = data.name.split("/")[-1].split(".")[0][:-8]
	    k = 20
	    st = str([query, name_to_save])
	    st_hashed = hashlib.sha256(st.encode()).hexdigest()
	    hist = st + " " + st_hashed
	    current_time = datetime.now().strftime("%H:%M:%S")
	    # Key the answer cache on the SHA-256 digest: the built-in hash() of a str
	    # is randomised per process, so files named after it are never found again
	    # after a restart.
	    cache_path = "{}.csv".format(st_hashed)

	    try:  # if the same question was already asked for this document, reuse the answers
	        # dtype=str / keep_default_na=False keep empty or "NA" answers as text
	        # instead of turning them into NaN.
	        cached = pd.read_csv(cache_path, dtype=str, keep_default_na=False)
	        return _format_answers(cached)
	    except Exception as e:
	        # Best-effort cache: on any failure fall through and recompute.
	        print(e)
	        print(st)

	    if os.path.isfile(name_to_save + ".txt"):  # document already used: load its embeddings
	        # NOTE(review): allow_pickle=True unpickles these files; that is only
	        # safe while they are exclusively written by encode_docs.
	        doc_emb = np.load('emb_{}.npy'.format(name_to_save), allow_pickle=True).item()
	        doc_text = np.load('spans_{}.npy'.format(name_to_save), allow_pickle=True).item()
	        file_names_dicto = np.load('file_{}.npy'.format(name_to_save), allow_pickle=True).item()

	        doc_emb = np.array(list(doc_emb.values())).reshape(-1, 768)
	        doc_text = list(doc_text.values())
	        file_names = list(file_names_dicto.values())
	    else:
	        text = textract.process("{}".format(data.name)).decode('utf8')
	        text = text.replace("\r", " ")
	        text = text.replace("\n", " ")
	        text = text.replace(" . ", " ")

	        doc_emb, doc_text, file_names = encode_docs((name_to_save, text), maxlen=64, stride=32)

	        doc_emb = doc_emb.reshape(-1, 768)
	        # The saved text file doubles as the "already embedded" marker checked above.
	        with open("{}.txt".format(name_to_save), "w", encoding="utf-8") as f:
	            f.write(text)

	    # once embeddings are calculated, run MIPS
	    start = time.time()
	    query_emb = encode_query(query)

	    scores = np.matmul(query_emb, doc_emb.transpose(1, 0))[0].tolist()
	    doc_score_pairs = list(zip(doc_text, scores, file_names))
	    doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)

	    # Softmax over the k best scores only.
	    probs = softmax(sorted(scores, reverse=True)[:k])
	    table = {"Passage": [], "Answer": [], "Probabilities": []}

	    # get answers for each pair of question (from user) and top best passages
	    for i, (passage, _, _) in enumerate(doc_score_pairs[:k]):
	        passage = passage.replace("\n", "")

	        # Run the reader when the passage probability exceeds 0.1, or exceeds
	        # 0.05 for one of the three best passages.
	        if probs[i] > 0.1 or (i < 3 and probs[i] > 0.05):
	            QA = {'question': query, 'context': passage}
	            ans = pipe(QA)
	            probabilities = "P(a|p): {}, P(a|p,q): {}, P(p|q): {}".format(
	                round(ans["score"], 5),
	                round(ans["score"] * probs[i], 5),
	                round(probs[i], 5))
	            table["Passage"].append(passage)
	            table["Answer"].append(str(ans["answer"]).upper())
	            table["Probabilities"].append(probabilities)
	        else:
	            table["Passage"].append(passage)
	            table["Answer"].append("no_answer_calculated")
	            table["Probabilities"].append("P(p|q): {}".format(round(probs[i], 5)))

	    # format answers for ~nice output and save it for future
	    # (if the same question is asked again using same pdf)
	    df = pd.DataFrame(table)
	    print(df)
	    print("time: " + str(time.time() - start))

	    with open("HISTORY.txt", "a", encoding="utf-8") as f:
	        f.write(hist)
	        f.write(" " + str(current_time))
	        f.write("\n")
	    df.to_csv(cache_path, index=False)

	    # Iterating the rows (instead of indexing 0..k-1) also handles documents
	    # that produced fewer than k passages.
	    return _format_answers(df)



	# Assemble the demo UI: a question box plus a file upload feed `predict`,
	# whose result is shown as text. Users can flag a result as correct or wrong.
	iface = gr.Interface(
	    fn=predict,
	    inputs=[
	        gr.inputs.Textbox(default="What is Open-domain question answering?"),
	        gr.inputs.File(),
	    ],
	    outputs='text',
	    examples=[
	        ["How high is the highest mountain?", "China.pdf"],
	        ["Where is the highest mountain?", "China.pdf"],
	    ],
	    title=title,
	    description=description,
	    allow_flagging="manual",
	    flagging_options=["correct", "wrong"],
	    allow_screenshot=False,
	)

	# Start the app with request queueing enabled and errors surfaced in the UI.
	iface.launch(enable_queue=True, show_error=True)