Spaces:

DevBM
/

QGen

Running

App Files Files Community

QGen / app.py

DevBM

Added option generation; modified session continuation after downloading

297bd17 verified about 1 year ago

raw

history blame

8.75 kB

	import streamlit as st
	from transformers import T5ForConditionalGeneration, T5Tokenizer
	import spacy
	import nltk
	from sklearn.feature_extraction.text import TfidfVectorizer
	from rake_nltk import Rake
	import pandas as pd
	from fpdf import FPDF
	import wikipediaapi
	from functools import lru_cache
	nltk.download('punkt')
	nltk.download('stopwords')
	nltk.download('brown')
	from nltk.tokenize import sent_tokenize
	nltk.download('wordnet')
	from gensim.models import KeyedVectors
	from nltk.corpus import wordnet
	import random

	# Load the small English spaCy pipeline. It is used below for named-entity and
	# part-of-speech based keyword extraction and for picking distractor options.
	nlp = spacy.load("en_core_web_sm")

	# Wikipedia client used by entity_linking(); an explicit user agent is passed to the API.
	# NOTE(review): the contact address inside the user agent looks like a redacted
	# placeholder — confirm the real value before deploying.
	user_agent = 'QGen/1.0 ([email protected])'
	wiki_wiki = wikipediaapi.Wikipedia(user_agent= user_agent,language='en')

	# Load pre-trained word2vec vectors from a local binary file (this may take a while).
	# NOTE(review): unlike the T5 model, which is kept in st.session_state, this load happens
	# at module level; presumably it is repeated every time the script is re-executed —
	# consider caching it the same way.
	word_vectors = KeyedVectors.load_word2vec_format('vectors/GoogleNews-vectors-negative300.bin', binary=True)

	def load_model():
	    """Load the T5 question-generation checkpoint together with its tokenizer.

	    Returns:
	        A ``(model, tokenizer)`` tuple, both loaded from the
	        "DevBM/t5-large-squad" checkpoint.
	    """
	    checkpoint = "DevBM/t5-large-squad"
	    # The model is loaded before the tokenizer (tuple items evaluate left to right).
	    return (
	        T5ForConditionalGeneration.from_pretrained(checkpoint),
	        T5Tokenizer.from_pretrained(checkpoint),
	    )

	# Load the model and tokenizer only when this session does not hold them yet,
	# and keep both in Streamlit's session state.
	if 'model' not in st.session_state:
	    st.session_state.model, st.session_state.tokenizer = load_model()

	# Module-level aliases for the session's model and tokenizer (read by generate_question)
	model, tokenizer = st.session_state.model, st.session_state.tokenizer

	# Function to extract keywords using combined techniques
	def extract_keywords(text):
	    """Collect candidate answer keywords from *text* using RAKE, spaCy and TF-IDF.

	    Candidates are gathered in a fixed order — RAKE ranked phrases, spaCy named
	    entities, spaCy content words (nouns, proper nouns, verbs, adjectives) and
	    finally the TF-IDF vocabulary — and de-duplicated while keeping the first
	    occurrence. The result is therefore deterministic for a given input, which
	    matters because callers only use the first few keywords.

	    Args:
	        text: The source passage.

	    Returns:
	        A list of unique keyword strings; empty when nothing could be extracted.
	    """
	    # RAKE key phrases, in the order RAKE ranks them
	    rake = Rake()
	    rake.extract_keywords_from_text(text)
	    candidates = list(rake.get_ranked_phrases())

	    # spaCy: named entities first, then content words by part of speech
	    doc = nlp(text)
	    candidates.extend(ent.text for ent in doc.ents)
	    candidates.extend(
	        token.text for token in doc
	        if token.pos_ in ("NOUN", "PROPN", "VERB", "ADJ")
	    )

	    # TF-IDF vocabulary (English stop words removed)
	    vectorizer = TfidfVectorizer(stop_words='english')
	    try:
	        vectorizer.fit([text])
	    except ValueError:
	        # scikit-learn raises ValueError for an empty vocabulary (e.g. blank or
	        # stop-word-only text); fall back to the RAKE/spaCy candidates alone.
	        pass
	    else:
	        candidates.extend(vectorizer.get_feature_names_out())

	    # dict.fromkeys removes duplicates while preserving first-seen order
	    return list(dict.fromkeys(candidates))

	# Function to map keywords to sentences with customizable context window size
	def map_keywords_to_sentences(text, keywords, context_window_size):
	    """Map each keyword to the sentences around every sentence that contains it.

	    Args:
	        text: Passage to split into sentences with ``sent_tokenize``.
	        keywords: Iterable of keyword strings; matching is a plain,
	            case-sensitive substring test against each sentence.
	        context_window_size: Number of sentences to include before and after
	            each matching sentence.

	    Returns:
	        A dict from keyword to its context string. When a keyword matches
	        several sentences, the windows are concatenated with single spaces.
	        Keywords that match no sentence are absent.
	    """
	    sentences = sent_tokenize(text)
	    sentence_count = len(sentences)
	    contexts = {}
	    for keyword in keywords:
	        for position, sentence in enumerate(sentences):
	            if keyword not in sentence:
	                continue
	            # Clamp the window to the bounds of the sentence list
	            first = max(0, position - context_window_size)
	            last = min(sentence_count, position + context_window_size + 1)
	            window = ' '.join(sentences[first:last])
	            if keyword in contexts:
	                contexts[keyword] += ' ' + window
	            else:
	                contexts[keyword] = window
	    return contexts

	def get_similar_words(word, n=3):
	    """Return the *n* words most similar to *word* according to ``word_vectors``.

	    Returns an empty list when the lookup raises ``KeyError`` (the word is not
	    in the embedding vocabulary).
	    """
	    try:
	        neighbours = word_vectors.most_similar(word, topn=n)
	        # Each entry is a (word, score) pair; only the word is kept
	        return [neighbour for neighbour, _score in neighbours]
	    except KeyError:
	        return []

	def get_synonyms(word, n=3):
	    """Return up to *n* distinct WordNet synonyms of *word*.

	    Lemma names are made display-ready by turning WordNet's underscores into
	    spaces. Candidates equal to *word* or to an earlier synonym are skipped,
	    compared case-insensitively, so the answer never comes back as its own
	    synonym in a different case.

	    Args:
	        word: The word to look up.
	        n: Maximum number of synonyms to return; values <= 0 yield ``[]``.

	    Returns:
	        A list of at most *n* synonym strings, in WordNet iteration order.
	    """
	    synonyms = []
	    if n <= 0:
	        return synonyms

	    # Lower-cased forms already taken, seeded with the word itself
	    seen = {word.lower()}
	    for syn in wordnet.synsets(word):
	        for lemma in syn.lemmas():
	            # Multi-word lemmas are stored as e.g. "ice_cream"
	            name = lemma.name().replace('_', ' ')
	            key = name.lower()
	            if key in seen:
	                continue
	            seen.add(key)
	            synonyms.append(name)
	            if len(synonyms) == n:
	                return synonyms
	    return synonyms

	def generate_options(answer, context, n=3):
	    """Build a shuffled list of multiple-choice options containing *answer*.

	    Distractors are drawn, in order of preference, from: embedding neighbours
	    of the answer, WordNet synonyms, other named entities in *context*, and
	    finally random alphabetic words from *context*. Candidates are
	    de-duplicated case-insensitively as they are added, so a later source is
	    consulted whenever an earlier one only produced duplicates.

	    Args:
	        answer: The correct answer; always present in the result.
	        context: Text from which fallback distractors are taken.
	        n: Desired number of distractors.

	    Returns:
	        A shuffled list of up to ``n + 1`` unique options (fewer when not
	        enough distinct candidates exist).
	    """
	    wanted = n + 1
	    options = [answer]
	    seen = {answer.lower()}

	    def add_unique(candidates):
	        # Append unseen candidates until the target size is reached
	        for candidate in candidates:
	            if len(options) >= wanted:
	                return
	            key = candidate.lower()
	            if key not in seen:
	                seen.add(key)
	                options.append(candidate)

	    # Try to get similar words based on word embeddings
	    add_unique(get_similar_words(answer, n))

	    # If we don't have enough options, try synonyms
	    if len(options) < wanted:
	        add_unique(get_synonyms(answer, wanted - len(options)))

	    if len(options) < wanted:
	        # Parse the context once and reuse it for both remaining fallbacks
	        doc = nlp(context)

	        # Other entities mentioned in the context
	        add_unique(ent.text for ent in doc.ents)

	        # Last resort: alphabetic context words in random order
	        if len(options) < wanted:
	            context_words = [token.text for token in doc if token.is_alpha]
	            random.shuffle(context_words)
	            add_unique(context_words)

	    # Shuffle so the correct answer is not always listed first
	    random.shuffle(options)

	    return options

	# Function to perform entity linking using Wikipedia API
	@lru_cache(maxsize=128)
	def entity_linking(keyword):
	    """Return the full Wikipedia URL for *keyword*, or ``None`` if no page exists.

	    Results are memoised per keyword (up to 128 entries) by ``lru_cache``.
	    """
	    wiki_page = wiki_wiki.page(keyword)
	    return wiki_page.fullurl if wiki_page.exists() else None

	# Function to generate questions using beam search
	def generate_question(context, answer, num_beams):
	    """Generate one question whose answer is *answer*, given *context*.

	    Args:
	        context: Passage text placed after the ``<context>`` marker.
	        answer: Answer text placed after the ``<answer>`` marker.
	        num_beams: Beam width passed to ``model.generate``.

	    Returns:
	        The decoded first output sequence, with special tokens removed.
	    """
	    prompt = f"<context> {context} <answer> {answer}"
	    encoded_prompt = tokenizer.encode(prompt, return_tensors='pt')
	    generated = model.generate(encoded_prompt, num_beams=num_beams, early_stopping=True)
	    return tokenizer.decode(generated[0], skip_special_tokens=True)

	# Function to export questions to CSV
	def export_to_csv(data):
	    """Serialise ``(context, answer, question)`` rows to CSV text.

	    Args:
	        data: Iterable of 3-item rows.

	    Returns:
	        A CSV string with the header ``Context,Answer,Question`` and no index
	        column.
	    """
	    header = ["Context", "Answer", "Question"]
	    return pd.DataFrame(data, columns=header).to_csv(index=False, encoding='utf-8')

	# Function to export questions to PDF
	def export_to_pdf(data):
	    """Render ``(context, answer, question)`` rows into a PDF document.

	    The built-in "Arial" core font only covers Latin-1, and the rendered
	    document is encoded as Latin-1 at the end. Characters outside that range
	    (curly quotes, dashes, non-Western names, ...) are therefore replaced with
	    "?" up front instead of letting the export fail with UnicodeEncodeError.

	    Args:
	        data: Iterable of 3-item rows.

	    Returns:
	        The PDF file content as ``bytes``.
	    """
	    def latin1_safe(value):
	        # Substitute anything Latin-1 cannot represent
	        return str(value).encode('latin-1', 'replace').decode('latin-1')

	    pdf = FPDF()
	    pdf.add_page()
	    pdf.set_font("Arial", size=12)

	    for context, answer, question in data:
	        pdf.multi_cell(0, 10, latin1_safe(f"Context: {context}"))
	        pdf.multi_cell(0, 10, latin1_safe(f"Answer: {answer}"))
	        pdf.multi_cell(0, 10, latin1_safe(f"Question: {question}"))
	        pdf.ln(10)

	    # dest='S' returns the document as a string rather than writing a file
	    return pdf.output(name='questions.pdf',dest='S').encode('latin1')

	# Generated rows are kept in session state so they survive script reruns
	# (for example the rerun triggered by clicking a download button).
	if 'data' not in st.session_state:
	    st.session_state.data = None

	# Streamlit interface
	st.title(":blue[Question Generator from Text]")
	text = st.text_area("Enter text here:", value="Joe Biden, the current US president is on a weak wicket going in for his reelection later this November against former President Donald Trump.")

	with st.sidebar:
	    st.subheader("Customization Options")
	    # Customization options
	    num_beams = st.slider("Select number of beams for question generation", min_value=1, max_value=10, value=5)
	    context_window_size = st.slider("Select context window size (number of sentences before and after)", min_value=1, max_value=5, value=1)
	    num_questions = st.slider("Select number of questions to generate", min_value=1, max_value=1000, value=5)
	    # NOTE: the selected complexity is collected but not used by the generation code below
	    question_complexity = st.selectbox("Select question complexity", ["Simple", "Intermediate", "Complex"])

	if st.button("Generate Questions"):
	    if text:
	        # The model and tokenizer are already held in st.session_state, so
	        # they are not loaded again here.
	        keywords = extract_keywords(text)
	        keyword_sentence_mapping = map_keywords_to_sentences(text, keywords, context_window_size)

	        st.subheader("Generated Questions:")
	        data = []
	        for i, (keyword, context) in enumerate(keyword_sentence_mapping.items()):
	            if i >= num_questions:
	                break
	            linked_entity = entity_linking(keyword)
	            question = generate_question(context, keyword, num_beams=num_beams)
	            options = generate_options(keyword, context)

	            st.write(f"Context: {context}")
	            st.write(f"Answer: {keyword}")
	            st.write(f"Question: {question}")
	            st.write("Options:")
	            # enumerate supplies the index used for the A/B/C/... labels
	            for j, option in enumerate(options):
	                st.write(f"{chr(65+j)}. {option}")

	            if linked_entity:
	                st.write(f"Entity Link: {linked_entity}")
	            st.write("---")
	            data.append((context, keyword, question))

	        # Add the data to session state
	        st.session_state.data = data
	    else:
	        st.write("Please enter some text to generate questions.")

	# Export buttons: rendered from session state on every run, so they stay
	# available after the rerun caused by a download click.
	if st.session_state.data is not None:
	    stored_rows = st.session_state.data
	    with st.sidebar:
	        st.subheader('Download Content')
	        csv_data = export_to_csv(stored_rows)
	        st.download_button(label="CSV Format", data=csv_data, file_name='questions.csv', mime='text/csv')

	        pdf_data = export_to_pdf(stored_rows)
	        st.download_button(label="PDF Format", data=pdf_data, file_name='questions.pdf', mime='application/pdf')
	    st.markdown("You can download the data from the sidebar.")