Spaces:

DevBM
/

QGen

Sleeping

App Files Files Community

DevBM committed on Jul 8, 2024

Commit

b38adec

verified ·

1 Parent(s): f174f61

using llama3 for option generation

Browse files

Files changed (1) hide show

app.py +164 -37

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import streamlit as st
 from transformers import T5ForConditionalGeneration, T5Tokenizer
 import spacy
 import nltk
 from sklearn.feature_extraction.text import TfidfVectorizer
@@ -11,7 +12,8 @@ from functools import lru_cache
 nltk.download('punkt')
 nltk.download('stopwords')
 nltk.download('brown')
-from nltk.tokenize import sent_tokenize
 nltk.download('wordnet')
 from nltk.corpus import wordnet
 import random
@@ -30,6 +32,8 @@ import uuid
 import time
 import asyncio
 import aiohttp
 print("***************************************************************")
 st.set_page_config(
@@ -84,7 +88,7 @@ def load_model(modelname):
 # Load Spacy Model
 @st.cache_resource
 def load_nlp_models():
-    nlp = spacy.load("en_core_web_md")
     s2v = sense2vec.Sense2Vec().from_disk('s2v_old')
     return nlp, s2v
@@ -97,6 +101,13 @@ def load_qa_models():
     spell = SpellChecker()
     return similarity_model, spell
 with st.sidebar:
     select_model = st.selectbox("Select Model", ("T5-large","T5-small"))
 if select_model == "T5-large":
@@ -106,7 +117,12 @@ elif select_model == "T5-small":
 nlp, s2v = load_nlp_models()
 similarity_model, spell = load_qa_models()
 context_model = similarity_model
 model, tokenizer = load_model(modelname)
 # Info Section
 def display_info():
     st.sidebar.title("Information")
@@ -251,7 +267,7 @@ def get_synonyms(word, n=3):
                     return synonyms
     return synonyms
-def generate_options(answer, context, n=3):
     options = [answer]
     # Add contextually relevant words using a pre-trained model
@@ -292,6 +308,142 @@ def generate_options(answer, context, n=3):
     return options
 # Function to map keywords to sentences with customizable context window size
 def map_keywords_to_sentences(text, keywords, context_window_size):
     sentences = sent_tokenize(text)
@@ -331,38 +483,8 @@ async def generate_question_async(context, answer, num_beams):
     except Exception as e:
         raise QuestionGenerationError(f"Error in question generation: {str(e)}")
-async def generate_options_async(answer, context, n=3):
-    try:
-        options = [answer]
-        # Add contextually relevant words using a pre-trained model
-        context_embedding = await asyncio.to_thread(context_model.encode, context)
-        answer_embedding = await asyncio.to_thread(context_model.encode, answer)
-        context_words = [token.text for token in nlp(context) if token.is_alpha and token.text.lower() != answer.lower()]
-        # Compute similarity scores and sort context words
-        similarity_scores = [util.pytorch_cos_sim(await asyncio.to_thread(context_model.encode, word), answer_embedding).item() for word in context_words]
-        sorted_context_words = [word for _, word in sorted(zip(similarity_scores, context_words), reverse=True)]
-        options.extend(sorted_context_words[:n])
-        # Try to get similar words based on sense2vec
-        similar_words = await asyncio.to_thread(get_similar_words_sense2vec, answer, n)
-        options.extend(similar_words)
-        # If we don't have enough options, try synonyms
-        if len(options) < n + 1:
-            synonyms = await asyncio.to_thread(get_synonyms, answer, n - len(options) + 1)
-            options.extend(synonyms)
-        # Ensure we have the correct number of unique options
-        options = list(dict.fromkeys(options))[:n+1]
-        # Shuffle the options
-        random.shuffle(options)
-        return options
-    except Exception as e:
-        raise QuestionGenerationError(f"Error in generating options: {str(e)}")
 # Function to generate questions using beam search
@@ -395,13 +517,16 @@ async def generate_questions_async(text, num_questions, context_window_size, num
         st.error(f"An unexpected error occurred: {str(e)}")
         return []
-async def process_batch(batch, keywords, context_window_size, num_beams):
     questions = []
     for text in batch:
         keyword_sentence_mapping = map_keywords_to_sentences(text, keywords, context_window_size)
         for keyword, context in keyword_sentence_mapping.items():
             question = await generate_question_async(context, keyword, num_beams)
-            options = await generate_options_async(keyword, context)
             overall_score, relevance_score, complexity_score, spelling_correctness = assess_question_quality(context, question, keyword)
             if overall_score >= 0.5:
                 questions.append({
@@ -477,6 +602,7 @@ def assess_question_quality(context, question, answer):
     return overall_score, relevance_score, complexity_score, spelling_correctness
 def main():
     # Streamlit interface
     st.title(":blue[Question Generator System]")
     session_id = get_session_id()
@@ -498,6 +624,7 @@ def main():
         num_beams = st.slider("Select number of beams for question generation", min_value=2, max_value=10, value=2)
         context_window_size = st.slider("Select context window size (number of sentences before and after)", min_value=1, max_value=5, value=1)
         num_questions = st.slider("Select number of questions to generate", min_value=1, max_value=1000, value=5)
         col1, col2 = st.columns(2)
         with col1:
             extract_all_keywords = st.toggle("Extract Max Keywords",value=False)
@@ -518,14 +645,14 @@ def main():
     if text:
         text = clean_text(text)
     generate_questions_button = st.button("Generate Questions")
-    st.markdown('<span aria-label="Generate questions button">Above is the generate questions button</span>', unsafe_allow_html=True)
     # if generate_questions_button:
     if generate_questions_button and text:
         start_time = time.time()
         with st.spinner("Generating questions..."):
             try:
-                state['generated_questions'] = asyncio.run(generate_questions_async(text, num_questions, context_window_size, num_beams, extract_all_keywords))
                 if not state['generated_questions']:
                     st.warning("No questions were generated. The text might be too short or lack suitable content.")
                 else:

 import streamlit as st
 from transformers import T5ForConditionalGeneration, T5Tokenizer
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, LlamaForCausalLM
 import spacy
 import nltk
 from sklearn.feature_extraction.text import TfidfVectorizer
 nltk.download('punkt')
 nltk.download('stopwords')
 nltk.download('brown')
+from nltk.tokenize import sent_tokenize, word_tokenize
+from nltk.tag import pos_tag
 nltk.download('wordnet')
 from nltk.corpus import wordnet
 import random
 import time
 import asyncio
 import aiohttp
+import torch
+from dotenv import load_dotenv
 print("***************************************************************")
 st.set_page_config(
 # Load Spacy Model
 @st.cache_resource
 def load_nlp_models():
+    nlp = spacy.load("en_core_web_lg")
     s2v = sense2vec.Sense2Vec().from_disk('s2v_old')
     return nlp, s2v
     spell = SpellChecker()
     return similarity_model, spell
+@st.cache_resource
+def load_llm_model():
+    """Load the Llama 3 8B Instruct tokenizer and causal-LM weights.
+
+    The model is requested in float16 with ``device_map="auto"``; the result
+    is wrapped in ``st.cache_resource``.
+
+    Returns:
+        A ``(tokenizer, model)`` tuple.
+    """
+    checkpoint = "meta-llama/Meta-Llama-3-8B-Instruct"
+    llm_tok = AutoTokenizer.from_pretrained(checkpoint)
+    llm = LlamaForCausalLM.from_pretrained(
+        checkpoint,
+        torch_dtype=torch.float16,
+        device_map="auto",
+    )
+    return llm_tok, llm
 with st.sidebar:
     select_model = st.selectbox("Select Model", ("T5-large","T5-small"))
 if select_model == "T5-large":
 nlp, s2v = load_nlp_models()
 similarity_model, spell = load_qa_models()
 context_model = similarity_model
+sentence_model = similarity_model
 model, tokenizer = load_model(modelname)
+# The pipeline receives the checkpoint name for both the model and the
+# tokenizer and loads them itself; load_llm_model() is not used here.
+llm_tokenizer, llm_model = "meta-llama/Meta-Llama-3-8B-Instruct", "meta-llama/Meta-Llama-3-8B-Instruct"
+@st.cache_resource
+def load_llm_pipeline(model_name, tokenizer_name):
+    """Build the text-generation pipeline once and reuse it.
+
+    Streamlit re-executes the script on every interaction, so constructing
+    the pipeline at module level would rebuild it on each rerun; caching it
+    matches how the other model loaders in this file are declared.
+    """
+    return pipeline("text-generation", model=model_name, tokenizer=tokenizer_name, max_new_tokens=200)
+pipe = load_llm_pipeline(llm_model, llm_tokenizer)
 # Info Section
 def display_info():
     st.sidebar.title("Information")
                     return synonyms
     return synonyms
+def get_fallback_options(answer, context, n=3):
     options = [answer]
     # Add contextually relevant words using a pre-trained model
     return options
+def get_semantic_similarity(word1, word2):
+    """Return the cosine similarity of the embeddings of ``word1`` and ``word2``.
+
+    Both texts are encoded in a single ``sentence_model.encode`` call and the
+    similarity is returned as a plain Python number via ``.item()``.
+    """
+    first_vec, second_vec = sentence_model.encode([word1, word2])
+    score = util.pytorch_cos_sim(first_vec, second_vec)
+    return score.item()
+def ensure_grammatical_consistency(question, answer, option):
+    """Check whether ``option`` can grammatically stand in for ``answer``.
+
+    Two conditions must hold:
+
+    1. The last token of ``answer`` and the last token of ``option`` carry
+       the same part-of-speech tag.
+    2. Replacing ``answer`` with ``option`` inside ``question`` leaves the
+       question's sequence of part-of-speech tags unchanged.
+
+    Only the tags are compared, never the words themselves: comparing the
+    (word, tag) pairs would reject every substitution that actually changes
+    the question text.
+
+    Args:
+        question: The generated question text.
+        answer: The correct answer.
+        option: The candidate distractor.
+
+    Returns:
+        True if both conditions hold, otherwise False. An ``answer`` or
+        ``option`` that tokenizes to nothing yields False.
+    """
+    answer_tags = [tag for _, tag in pos_tag(word_tokenize(answer))]
+    option_tags = [tag for _, tag in pos_tag(word_tokenize(option))]
+    # Without tokens there is no final tag to compare (and an empty answer
+    # would make str.replace insert the option between every character).
+    if not answer_tags or not option_tags:
+        return False
+    # Check if the answer and option have the same part of speech
+    if answer_tags[-1] != option_tags[-1]:
+        return False
+    # Check if the option fits grammatically in the question
+    question_tags = [tag for _, tag in pos_tag(word_tokenize(question))]
+    option_question = question.replace(answer, option)
+    option_question_tags = [tag for _, tag in pos_tag(word_tokenize(option_question))]
+    return question_tags == option_question_tags
+def get_word_type(word):
+    """Return the coarse part-of-speech tag of the first token of ``word``.
+
+    ``word`` is run through the module-level ``nlp`` pipeline; only the
+    first resulting token is inspected, so multi-token input is classified
+    by its first token.
+
+    Returns:
+        The token's ``pos_`` string, or an empty string when the pipeline
+        produces no tokens (for example for an empty string).
+    """
+    doc = nlp(word)
+    # Guard the index: an empty Doc would otherwise raise IndexError.
+    if len(doc) == 0:
+        return ""
+    return doc[0].pos_
+def generate_text_with_llama(prompt):
+    """Generate a completion for ``prompt`` with the module-level ``pipe``.
+
+    The prompt is wrapped in ``[INST] ... [/INST]`` markers and sampled with
+    ``temperature=0.7``. Only the newly generated text is returned: the
+    pipeline is asked not to echo the prompt, so callers that parse the
+    result do not pick up fragments of their own instructions.
+
+    Args:
+        prompt: The instruction text to send to the model.
+
+    Returns:
+        The generated continuation with surrounding whitespace removed.
+    """
+    # NOTE(review): [INST] markers are the Llama 2 instruction format; the
+    # Llama 3 Instruct checkpoint configured for ``pipe`` ships its own chat
+    # template - confirm which prompt format is intended.
+    full_prompt = f"""[INST] {prompt} [/INST]"""
+    outputs = pipe(
+        full_prompt,
+        temperature=0.7,
+        do_sample=True,
+        return_full_text=False,
+    )
+    return outputs[0]['generated_text'].strip()
+async def generate_options_with_llm(answer, context, question, n=4):
+    """Ask the LLM for up to ``n - 1`` incorrect answer options.
+
+    The model response is split on commas. Blank entries, entries equal to
+    ``answer`` (case-insensitively) and repeated entries are discarded, so
+    every returned option is a distinct, non-empty distractor.
+
+    Args:
+        answer: The correct answer.
+        context: The passage the question was generated from.
+        question: The generated question.
+        n: Total number of choices wanted, including the correct answer.
+
+    Returns:
+        A list of at most ``n - 1`` distractors. If generation raises, the
+        error is shown with ``st.error`` and an empty list is returned.
+    """
+    prompt = f"""Given the following context, question, and correct answer, generate {n-1} incorrect but plausible answer options. The options should be:
+1. Contextually related to the given context
+2. Grammatically consistent with the question
+3. Different from the correct answer
+4. Not explicitly mentioned in the given context
+Context: {context}
+Question: {question}
+Correct Answer: {answer}
+Provide the options in a comma-separated list.
+"""
+    try:
+        response = await asyncio.to_thread(generate_text_with_llama, prompt)
+        options = []
+        seen = {answer.lower()}
+        for raw_option in response.split(','):
+            option = raw_option.strip()
+            key = option.lower()
+            # Skip blanks (e.g. from a trailing comma), the answer, and repeats.
+            if not option or key in seen:
+                continue
+            seen.add(key)
+            options.append(option)
+        print(f"\n\nLLM Options are: {options}\n\n")
+        # max() keeps the slice bound non-negative when n is 0.
+        return options[:max(n - 1, 0)]  # Ensure we only return n-1 options
+    except Exception as e:
+        st.error(f"Error generating options with LLM: {e}")
+        return []
+def _extend_unique(options, candidates, limit):
+    """Append unseen candidates to ``options`` in place until it holds ``limit`` items."""
+    seen = {existing.lower() for existing in options}
+    for candidate in candidates:
+        if len(options) >= limit:
+            break
+        key = candidate.lower()
+        if key not in seen:
+            seen.add(key)
+            options.append(candidate)
+async def generate_options_async(answer, context, question, n=4):
+    """Build a shuffled list of answer choices that contains ``answer``.
+
+    Distractors are gathered in three stages, each used only while fewer
+    than ``n`` choices have been collected:
+
+    1. ``generate_options_with_llm``
+    2. ``generate_semantic_options``
+    3. ``get_fallback_options``, keeping only grammatically consistent items
+
+    Choices are de-duplicated case-insensitively and capped at ``n``. Each
+    stage runs at most once, so the function always terminates; when the
+    sources cannot supply enough distractors the list is shorter than ``n``.
+
+    Args:
+        answer: The correct answer.
+        context: The passage the question was generated from.
+        question: The generated question.
+        n: Total number of choices wanted, including the correct answer.
+
+    Returns:
+        A shuffled list of at most ``n`` choices including ``answer``.
+    """
+    options = [answer]
+    # Generate options using the language model
+    llm_options = await generate_options_with_llm(answer, context, question, n)
+    _extend_unique(options, llm_options, n)
+    # If we don't have enough options, fall back to previous methods
+    if len(options) < n:
+        missing = n - len(options)
+        # generate_semantic_options treats its n as a total that includes
+        # the answer, and returns the answer too, so ask for one extra; the
+        # duplicate answer is dropped by _extend_unique.
+        semantic_options = await generate_semantic_options(answer, context, question, missing + 1)
+        _extend_unique(options, semantic_options, n)
+    # If we still don't have enough options, use the fallback method
+    if len(options) < n:
+        # get_fallback_options is a plain function, so it must not be awaited.
+        fallback_options = get_fallback_options(answer, context)
+        consistent = (
+            option for option in fallback_options
+            if ensure_grammatical_consistency(question, answer, option)
+        )
+        _extend_unique(options, consistent, n)
+    # Shuffle the options
+    random.shuffle(options)
+    return options
+async def generate_semantic_options(answer, context, question, n=4):
+    """Pick distractors for ``answer`` from the words of ``context``.
+
+    Candidate words are the distinct alphabetic tokens of ``context`` other
+    than ``answer``. A candidate is kept when ``get_word_type`` gives it the
+    same type as the answer and its similarity to the answer lies strictly
+    between 0.2 and 0.8. The ``n - 1`` most similar candidates are then
+    filtered with ``ensure_grammatical_consistency``. If fewer than ``n``
+    choices result, ``get_fallback_options`` is consulted once.
+
+    Args:
+        answer: The correct answer.
+        context: The passage the question was generated from.
+        question: The generated question.
+        n: Total number of choices wanted, including the correct answer.
+
+    Returns:
+        A shuffled list of at most ``n`` choices including ``answer``; it is
+        shorter than ``n`` when not enough suitable distractors exist.
+
+    Raises:
+        QuestionGenerationError: If any step fails; the original exception
+            is attached as the cause.
+    """
+    try:
+        options = [answer]
+        # Get context words (each distinct word once, so a word repeated in
+        # the context cannot become a duplicate option)
+        doc = nlp(context)
+        context_words = list(dict.fromkeys(
+            token.text for token in doc
+            if token.is_alpha and token.text.lower() != answer.lower()
+        ))
+        # Get answer type
+        answer_type = get_word_type(answer)
+        print(answer_type,"\n")
+        # Get semantically similar words
+        similar_words = []
+        for word in context_words:
+            if get_word_type(word) == answer_type:
+                similarity = get_semantic_similarity(answer, word)
+                if 0.2 < similarity < 0.8:  # Adjust these thresholds as needed
+                    similar_words.append((word, similarity))
+        # Sort by similarity (descending) and take top n-1
+        similar_words.sort(key=lambda x: x[1], reverse=True)
+        top_similar_words = [word for word, _ in similar_words[:max(n - 1, 0)]]
+        # Ensure grammatical consistency
+        options.extend(
+            word for word in top_similar_words
+            if ensure_grammatical_consistency(question, answer, word)
+        )
+        # If we don't have enough options, fall back to original method.
+        # The fallback is queried a single time: looping until the list is
+        # full would never end when no fallback option qualifies.
+        if len(options) < n:
+            for option in get_fallback_options(answer, context, 3):
+                if len(options) >= n:
+                    break
+                if option not in options and ensure_grammatical_consistency(question, answer, option):
+                    options.append(option)
+        # Shuffle the options
+        random.shuffle(options)
+        print(options)
+        st.write("All possibel options are: ", options, "\n")
+        return options
+    except Exception as e:
+        raise QuestionGenerationError(f"Error in generating options: {str(e)}") from e
 # Function to map keywords to sentences with customizable context window size
 def map_keywords_to_sentences(text, keywords, context_window_size):
     sentences = sent_tokenize(text)
     except Exception as e:
         raise QuestionGenerationError(f"Error in question generation: {str(e)}")
 # Function to generate questions using beam search
         st.error(f"An unexpected error occurred: {str(e)}")
         return []
+async def process_batch(batch, keywords, context_window_size, num_beams, use_llm_options):
     questions = []
     for text in batch:
         keyword_sentence_mapping = map_keywords_to_sentences(text, keywords, context_window_size)
         for keyword, context in keyword_sentence_mapping.items():
             question = await generate_question_async(context, keyword, num_beams)
+            if use_llm_options:
+                options = await generate_options_async(keyword, context, question)
+            else:
+                options =await generate_semantic_options(keyword, context, question)
             overall_score, relevance_score, complexity_score, spelling_correctness = assess_question_quality(context, question, keyword)
             if overall_score >= 0.5:
                 questions.append({
     return overall_score, relevance_score, complexity_score, spelling_correctness
 def main():
+    load_dotenv()
     # Streamlit interface
     st.title(":blue[Question Generator System]")
     session_id = get_session_id()
         num_beams = st.slider("Select number of beams for question generation", min_value=2, max_value=10, value=2)
         context_window_size = st.slider("Select context window size (number of sentences before and after)", min_value=1, max_value=5, value=1)
         num_questions = st.slider("Select number of questions to generate", min_value=1, max_value=1000, value=5)
+        use_llm_for_options = st.toggle("Use AI for Advanced option generation", False)
         col1, col2 = st.columns(2)
         with col1:
             extract_all_keywords = st.toggle("Extract Max Keywords",value=False)
     if text:
         text = clean_text(text)
     generate_questions_button = st.button("Generate Questions")
+    # st.markdown('<span aria-label="Generate questions button">Above is the generate questions button</span>', unsafe_allow_html=True)
     # if generate_questions_button:
     if generate_questions_button and text:
         start_time = time.time()
         with st.spinner("Generating questions..."):
             try:
+                state['generated_questions'] = asyncio.run(generate_questions_async(text, num_questions, context_window_size, num_beams, extract_all_keywords, use_llm_for_options))
                 if not state['generated_questions']:
                     st.warning("No questions were generated. The text might be too short or lack suitable content.")
                 else: