Spaces:

legacy107
/

flan-t5-large-ia3-wiki

Runtime error

App Files Files Community

flan-t5-large-ia3-wiki / app.py

legacy107

Update app.py

81d062a over 1 year ago

raw

history blame contribute delete

4.03 kB

	import gradio as gr
	from gradio.components import Textbox
	from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, T5ForConditionalGeneration
	from peft import PeftModel
	import torch
	import datasets
	from sentence_transformers import SentenceTransformer, util
	import math
	import re
	from nltk import sent_tokenize, word_tokenize
	import nltk
	# Fetch the NLTK tokenizer data used by chunk_splitter. Older NLTK releases
	# read the 'punkt' resource, while newer releases look up 'punkt_tab'
	# instead, so request both to keep sentence/word tokenization working on
	# either version.
	nltk.download('punkt')
	nltk.download('punkt_tab')

	# Load the bi-encoder used to rank context chunks against the question.
	bi_encoder = SentenceTransformer('legacy107/multi-qa-mpnet-base-dot-v1-wikipedia-search')
	bi_encoder.max_seq_length = 256
	# Number of best-matching chunks passed on to the answer generator.
	top_k = 3

	# Load the fine-tuned answer-generation model and its tokenizer.
	model_name = "legacy107/flan-t5-large-ia3-wiki2-100-merged"
	tokenizer = AutoTokenizer.from_pretrained(model_name)
	model = T5ForConditionalGeneration.from_pretrained(model_name)
	# Token budget for the encoded "question + context" prompt.
	max_length = 512
	# Upper bound on newly generated answer tokens.
	max_target_length = 200

	# Load the test split, shuffle it (unseeded, so the picks differ between
	# runs) and keep ten rows to offer as clickable examples in the UI.
	dataset = datasets.load_dataset("legacy107/qa_wikipedia_retrieved_chunks", split="test")
	dataset = dataset.shuffle()
	dataset = dataset.select(range(10))

	# Context chunking
	def chunk_splitter(context, chunk_size=100, overlap=0.20):
	    """Split ``context`` into overlapping, sentence-aligned chunks.

	    Sentences are found with ``nltk.sent_tokenize`` and sizes are measured
	    in ``nltk.word_tokenize`` tokens.

	    Args:
	        context: Text to split.
	        chunk_size: Target maximum number of word tokens per chunk. A chunk
	            can exceed it, because every chunk takes at least one new
	            sentence on top of its overlap so the scan always advances.
	        overlap: Fraction of ``chunk_size`` that may be carried over from
	            the end of one chunk to the start of the next.

	    Returns:
	        A list of chunk strings with newlines replaced by spaces. The list
	        is empty when the tokenizer finds no sentences in ``context``.
	    """
	    overlap_size = chunk_size * overlap
	    sentences = nltk.sent_tokenize(context)
	    # No sentences (e.g. an empty textbox): nothing to chunk, and indexing
	    # sentences[0] below would raise IndexError.
	    if not sentences:
	        return []

	    def normalize(text):
	        # Flatten newline runs into spaces. The quote replacements that
	        # used to sit here ('\"' -> '"', "\'" -> "'") replaced each
	        # character with itself, so they were dropped.
	        return text.replace('\n\n\n', " ").replace('\n\n', " ").replace('\n', " ")

	    chunks = []
	    text = sentences[0]

	    if len(sentences) == 1:
	        chunks.append(normalize(text))

	    i = 1
	    while i < len(sentences):
	        # Always consume one new sentence so each pass makes progress even
	        # when the carried-over text is already large.
	        text += " " + sentences[i]
	        i += 1
	        # Keep adding sentences while the chunk stays within chunk_size.
	        while i < len(sentences) and len(nltk.word_tokenize(f"{text} {sentences[i]}")) <= chunk_size:
	            text += " " + sentences[i]
	            i += 1

	        chunks.append(normalize(text))

	        if i >= len(sentences):
	            break

	        # Seed the next chunk with the last sentence of this one, then
	        # extend backwards through earlier sentences while the overlap
	        # budget allows. Start at i - 2: sentence i - 1 is already the
	        # seed, and starting at i - 1 would prepend it to itself.
	        text = sentences[i - 1]
	        j = i - 2
	        while j >= 0 and len(nltk.word_tokenize(f"{sentences[j]} {text}")) <= overlap_size:
	            text = sentences[j] + " " + text
	            j -= 1

	    return chunks


	def retrieve_context(query, contexts):
	    """Return the chunks that best match ``query``, joined into one string.

	    Args:
	        query: The question text to search with.
	        contexts: List of candidate chunk strings.

	    Returns:
	        Up to ``top_k`` chunks, ordered by descending search score and
	        joined with single spaces, with newlines replaced by spaces.
	        An empty string when ``contexts`` is empty.
	    """
	    # Nothing to rank: skip encoding and searching an empty corpus.
	    if not contexts:
	        return ""

	    corpus_embeddings = bi_encoder.encode(contexts, convert_to_tensor=True, show_progress_bar=False)
	    question_embedding = bi_encoder.encode(query, convert_to_tensor=True, show_progress_bar=False)

	    # semantic_search returns one hit list per query; there is a single
	    # query here, so take the first list.
	    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)[0]

	    # Order explicitly by score so the join below is best-match first.
	    hits = sorted(hits, key=lambda hit: hit['score'], reverse=True)
	    return " ".join(contexts[hit['corpus_id']] for hit in hits[:top_k]).replace("\n", " ")


	# Define your function to generate answers
	def generate_answer(question, context, title, ground):
	    """Answer ``question`` from the most relevant parts of ``context``.

	    The context is split into chunks, the best-matching chunks are
	    retrieved for the question, and the model generates an answer from
	    the prompt "question: ... context: ...".

	    Args:
	        question: The question to answer.
	        context: Full source text to search for supporting passages.
	        title: Article title; accepted to match the UI inputs, not used here.
	        ground: Ground-truth answer; accepted to match the UI inputs, not
	            used here.

	    Returns:
	        A ``(generated_answer, retrieved_context)`` tuple of strings.
	    """
	    contexts = chunk_splitter(context)
	    context = retrieve_context(question, contexts)

	    # Combine question and context
	    input_text = f"question: {question} context: {context}"

	    # Tokenize the input text
	    encoding = tokenizer(
	        input_text,
	        return_tensors="pt",
	        padding="max_length",
	        truncation=True,
	        max_length=max_length,
	    )

	    # Generate the answer. The prompt is padded to max_length, so hand the
	    # attention mask to generate() explicitly instead of relying on it
	    # being inferred from the pad tokens.
	    with torch.no_grad():
	        generated_ids = model.generate(
	            input_ids=encoding.input_ids,
	            attention_mask=encoding.attention_mask,
	            max_new_tokens=max_target_length,
	        )

	    # Decode and return the generated answer
	    generated_answer = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

	    return generated_answer, context


	# Define a function to list examples from the dataset
	def list_examples():
	    """Build the example rows shown in the UI from the loaded dataset.

	    Returns:
	        A list with one ``[question, article, title, answer]`` entry per
	        dataset row, in the same order as the interface inputs.
	    """
	    return [
	        [row["question"], row["article"], row["title"], row["answer"]]
	        for row in dataset
	    ]


	# Create a Gradio interface
	# Wire generate_answer into a Gradio form: four text inputs (matching the
	# function's parameters in order) and two text outputs (matching its
	# returned pair), pre-populated with examples drawn from the dataset.
	iface = gr.Interface(
	    fn=generate_answer,
	    inputs=[
	        Textbox(label=caption)
	        for caption in ("Question", "Context", "Article title", "Ground truth")
	    ],
	    outputs=[
	        Textbox(label=caption)
	        for caption in ("Generated Answer", "Retrieved Context")
	    ],
	    examples=list_examples(),
	)

	# Start serving the interface
	iface.launch()