QA-ContextRetriever

Sleeping

App Files Files Community

Muhammad Adnan committed on Nov 6, 2024

Commit

ae6eb20

1 Parent(s): 3623388

Initial commit of Streamlit app

Browse files

Files changed (4) hide show

app.py +147 -0
data_ret.py +57 -0
requirements.txt +8 -0
similarity_search.py +94 -0

app.py ADDED Viewed

	@@ -0,0 +1,147 @@

+import streamlit as st
+from transformers import pipeline
+from similarity_search import get_relevant_context  # Import function from similarity_search.py
+from bs4 import BeautifulSoup  # For stripping HTML/XML tags
+import spacy  # Import spaCy for NLP tasks
+# Load the spaCy English pipeline once at import time; extract_topic_from_question() runs it on each question.
+# The model must be installed first via 'python -m spacy download en_core_web_sm'.
+nlp = spacy.load("en_core_web_sm")
+# Load the Roberta model for question answering
+def load_qa_model():
+    print("Loading QA model...")
+    try:
+        qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
+        print("QA model loaded.")
+        return qa_model
+    except Exception as e:
+        print(f"Error loading QA model: {e}")
+        raise RuntimeError("Failed to load the QA model.")
+# Function to clean the context text (remove HTML tags and optional stop words)
+def clean_text(context, remove_stop_words=False):
+    # Remove HTML/XML tags
+    clean_context = BeautifulSoup(context, "html.parser").get_text()
+    if remove_stop_words:
+        stop_words = set(["the", "a", "an", "of", "and", "to", "in", "for", "on", "at", "by", "with", "about", "as", "from"])
+        clean_context = " ".join([word for word in clean_context.split() if word.lower() not in stop_words])
+    return clean_context
+# Function to extract proper nouns or pronouns from the question for context retrieval
+def extract_topic_from_question(question):
+    # Process the text with spaCy
+    doc = nlp(question)
+    # Define pronouns to exclude manually if necessary
+    excluded_pronouns = ['I', 'you', 'he', 'she', 'it', 'they', 'we', 'them', 'this', 'that', 'these', 'those']
+    # Extract proper nouns (PROPN) and pronouns (PRON), but exclude certain pronouns and stopwords
+    proper_nouns_or_pronouns = [
+        token.text for token in doc
+        if (
+            token.pos_ == 'PROPN' or token.pos_ == 'PRON') and token.text.lower() not in excluded_pronouns and not token.is_stop
+    ]
+    # If no proper nouns or pronouns are found, remove stopwords and return whatever is left
+    if not proper_nouns_or_pronouns:
+        remaining_tokens = [
+            token.text for token in doc
+            if not token.is_stop  # Just remove stopwords, keep all other tokens
+        ]
+        return " ".join(remaining_tokens)
+    # Otherwise, return the proper nouns or pronouns
+    return " ".join(proper_nouns_or_pronouns)
+# Inside the answer_question_with_context function, add debugging statements:
+def answer_question_with_context(question, qa_model):
+    try:
+        print(question)
+        # Extract topic from question (proper nouns or pronouns)
+        topic = extract_topic_from_question(question)
+        print(f"Extracted topic (proper nouns or pronouns): {topic}" if topic else "No proper nouns or pronouns extracted.")
+        # Retrieve relevant context based on the extracted topic
+        context = get_relevant_context(question, topic)
+        print(f"Retrieved Context: {context}")  # Debug: Show context result
+        if not context.strip():
+            return "No context found for answering.", ""
+        # Clean the context
+        context = clean_text(context, remove_stop_words=True)
+        # Use the QA model to extract an answer from the context
+        result = qa_model(question=question, context=context)
+        return result.get('answer', 'No answer found.'), context
+    except Exception as e:
+        print(f"Error during question answering: {e}")  # Debug: Log error in terminal
+        return f"Error during question answering: {e}", ""
+# Streamlit UI
+def main():
+    """Render the Streamlit page: question input, answer button, results and an About section.
+
+    On "Get Answer" the QA model is loaded, answer_question_with_context() is
+    called, and the answer plus the context it used are displayed. Any
+    exception raised in that flow is shown with st.error and in the log
+    placeholder.
+    """
+    st.title("RAG Question Answering with Context Retrieval")
+    # User input for the question
+    question = st.text_input("Enter your question:", "What is the capital of Italy?")  # Default question
+    # Placeholder that is overwritten in place with the current status message
+    log = st.empty()
+    # Button to get the answer
+    if st.button("Get Answer"):
+        if not question:
+            st.error("Please provide a question.")
+        else:
+            try:
+                # Display a loading spinner and log message for the QA model
+                log.text("Loading QA model...")
+                with st.spinner("Loading QA model... Please wait."):
+                    # Try loading the QA model
+                    qa_model = load_qa_model()
+                # Display log message for context retrieval
+                log.text("Retrieving context...")
+                with st.spinner("Retrieving context..."):
+                    answer, context = answer_question_with_context(question, qa_model)
+                if not context.strip():
+                    # If context is empty, let the user enter the context manually
+                    # NOTE(review): the text entered here is only displayed below; `answer` was
+                    # already computed above and is not recomputed from it — confirm intent.
+                    st.warning("I couldn't find any relevant context for this question. Please enter it below:")
+                    context = st.text_area("Enter your context here:", "", height=200, max_chars=1000)
+                    if not context.strip():
+                        context = "I couldn't find any relevant context, and you didn't provide one either. Maybe next time!"
+                # Display the answer and context
+                st.subheader("Answer:")
+                st.write(answer)  # Show the final answer
+                # Display the context
+                st.subheader("Context Used for Answering:")
+                st.text_area("Context:", context, height=200, max_chars=1000, key="context_input", disabled=False)  # Editable context box
+            except Exception as e:
+                st.error(f"An error occurred: {e}")
+                log.text(f"Error: {e}")  # Log error in place
+    # Display information about the application
+    st.markdown("""
+    ### About the Application
+    This is a **Retrieval-Augmented Generation (RAG)** application that answers questions by dynamically retrieving context from a dataset. Here's how it works:
+    1. **Dynamic Topic Extraction**: The application analyzes the user's question and extracts key topics (such as proper nouns or pronouns) to understand the context of the query.
+    2. **Context Retrieval**: Based on the extracted topic, the app searches for the most relevant documents (a few hundred) in the dataset.
+    3. **Answer Generation**: Using the retrieved context, an AI model (like RoBERTa) is used to generate the most accurate answer possible. The model combines the context with its internal knowledge to provide a robust and informed response.
+    4. **Customization**: If the application doesn't find enough relevant context automatically, you can manually input additional context to improve the answer.
+    The application leverages **Roberta-based question-answering models** to generate answers based on the context retrieved. This helps provide more accurate, context-specific answers compared to traditional approaches that rely solely on pre-trained model knowledge.
+    **Dataset Used**: The application dynamically pulls relevant documents from a dataset (e.g., academic papers, FAQ pages, product manuals, etc.), helping answer user questions more effectively.
+    """)
+# Run the UI only when this file is executed as a script, not when it is imported
+if __name__ == "__main__":
+    main()

data_ret.py ADDED Viewed

	@@ -0,0 +1,57 @@

+from datasets import load_dataset
+# Load the 'train' split once at import time; search_relevant_data() filters this module-level object.
+dataset = load_dataset('tom-010/google_natural_questions_answerability', split='train')
+# Function to filter based on a query/topic and return relevant data
+def search_relevant_data(topic="Artificial Intelligence", max_words=100, top_n=100):
+    # Filter the dataset based on the presence of the topic in 'question', 'answer', or 'text' fields
+    filtered_data = dataset.filter(
+        lambda x: (
+            (x['question'] is not None and topic.lower() in x['question'].lower()) or
+            (x['answer'] is not None and topic.lower() in x['answer'].lower()) or
+            (x['text'] is not None and topic.lower() in x['text'].lower())
+        )
+    )
+    # Ensure we only select up to the available number of rows
+    #num_to_select = min(top_n, len(filtered_data))  # Choose the minimum of top_n and available data
+    #filtered_data = filtered_data.select(range(num_to_select))  # Select up to 'num_to_select' rows
+    filtered_data = filtered_data.select(range(min(top_n, len(filtered_data))))
+    # Create a list to store the relevant data
+    relevant_documents = []
+    # Display and store an excerpt of the answer for each relevant entry
+    for entry in filtered_data:
+        # Check the type of 'entry' first to ensure it's a dictionary
+        if isinstance(entry, dict):
+            question = entry.get('question', '')  # Accessing the 'question' field safely
+            answer = entry.get('answer', '')  # Accessing the 'answer' field safely
+            text = entry.get('text', '')  # Accessing the 'text' field safely
+            # Only store the first 'max_words' words of the answer or text
+            answer_excerpt = ' '.join(answer.split()[:max_words]) if answer else ""
+            text_excerpt = ' '.join(text.split()[:max_words]) if text else ""
+            # Append relevant document information to the list
+            relevant_documents.append({
+                "question": question,
+                "answer": answer_excerpt,
+                "text": text_excerpt
+            })
+            # Debugging: Print a preview of the data (optional)
+            #print(f"Question: {question[:20]}...")  # Print first 20 chars of the question
+            #print(f"Answer (first {max_words} words): {answer_excerpt[:20]}...")  # Print first 20 words of the answer
+            #print(f"Text (first {max_words} words): {text_excerpt[:20]}...")  # Print first 20 words of the text
+            #print("-" * 50)
+        else:
+            print("Unexpected entry format:", entry)
+    return relevant_documents  # Return the list of relevant documents
+# Sample search query
+#relevant_data = search_relevant_data("vatican city")  # Change to the desired query/topic
+#print(f"Found {len(relevant_data)} relevant documents.")

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+streamlit==1.20.0
+transformers==4.33.0
+sentence-transformers==2.2.0
+scipy==1.10.0
+numpy==1.24.2
+datasets==2.9.0
+beautifulsoup4==4.12.0
+spacy==3.5.0

similarity_search.py ADDED Viewed

	@@ -0,0 +1,94 @@

+from sentence_transformers import SentenceTransformer
+from scipy.spatial.distance import cosine
+import numpy as np
+from data_ret import search_relevant_data  # Assuming this function fetches the data from some source
+import streamlit as st
+# Load the Sentence Transformer model for similarity search
+def load_similarity_model():
+    st.write("Loading similarity model...")  # Show status on Streamlit
+    retriever_model = SentenceTransformer("all-mpnet-base-v2")
+    st.write("Similarity model loaded.")
+    return retriever_model
+# Create embeddings for the retrieved documents
+def create_embeddings(documents, model):
+    if not documents:
+        st.write("No documents provided for embedding.")
+        return np.array([])  # Return empty array if no documents
+    st.write(f"Creating embeddings for {len(documents)} documents...")  # Show progress
+    embeddings = []
+    # Track progress of the embedding creation using Streamlit's progress bar
+    progress_bar = st.progress(0)
+    step = 1 / len(documents)  # This ensures the progress bar value stays within [0.0, 1.0]
+    # Include 'text' in the document text along with 'question' and 'answer'
+    document_texts = [doc['question'] + " " + doc['answer'] + " " + doc.get('text', '') for doc in documents]
+    for i, doc_text in enumerate(document_texts):
+        embedding = model.encode(doc_text)
+        embeddings.append(embedding)
+        progress_bar.progress(i * step)  # Update the progress bar within valid range
+    embeddings = np.array(embeddings)
+    st.write(f"Embeddings created with shape: {embeddings.shape}")
+    return embeddings
+# Retrieve documents based on the question embedding
+def retrieve_documents(question_embedding, document_embeddings, top_k=5):
+    if document_embeddings.size == 0:
+        st.write("No document embeddings available for retrieval.")
+        return []
+    st.write("Calculating similarities between question and documents...")
+    similarities = np.array([1 - cosine(question_embedding, doc_embedding) for doc_embedding in document_embeddings])
+    # Get indices of top K similarities (highest similarity first)
+    top_indices = similarities.argsort()[-top_k:][::-1]  # Sort in descending order
+    return top_indices
+# Main function to get the context from the most relevant documents based on topic and question
+def get_relevant_context(question, topic):
+    try:
+        st.write("Searching for relevant documents based on the topic...")
+        relevant_documents = search_relevant_data(topic)  # Use dynamic topic for search query
+        st.write(f"Found {len(relevant_documents)} relevant documents.")
+        if not relevant_documents:
+            return "No relevant documents found."
+        retriever_model = load_similarity_model()  # Load the similarity model
+        # Create document embeddings and show progress
+        document_embeddings = create_embeddings(relevant_documents, retriever_model)
+        if document_embeddings.size == 0:
+            return "No embeddings created for relevant documents."
+        st.write("Generating question embedding and retrieving relevant documents...")
+        question_embedding = retriever_model.encode(question)
+        relevant_doc_indices = retrieve_documents(question_embedding, document_embeddings)
+        if len(relevant_doc_indices) == 0:
+            return "No relevant documents found after embedding."
+        # Extract context from the top relevant documents
+        contexts = []
+        for idx in relevant_doc_indices:
+            doc = relevant_documents[idx]
+            context = doc.get('answer', '') + " " + doc.get('text', '')
+            if context.strip():
+                contexts.append(context)
+        if not contexts:
+            return "No valid contexts available for answering."
+        # Return the combined context for question answering
+        return " ".join(contexts)
+    except Exception as e:
+        st.write(f"Error processing question: {str(e)}")
+        return f"Error: {str(e)}"