girishwangikar committed on
Commit 321120c · verified · 1 Parent(s): e729802

Update app.py

Files changed (1)
  1. app.py +66 -47
app.py CHANGED
@@ -1,100 +1,119 @@
  import os
  import gradio as gr
- from transformers import pipeline
- from sentence_transformers import SentenceTransformer
- from pypdf import PdfReader
- from sklearn.metrics.pairwise import cosine_similarity
- import numpy as np
  from dotenv import load_dotenv

  load_dotenv()

  GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

- # Initialize models
- qa_model = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
- embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

- # Global variable to store the document chunks and their embeddings
- document_store = []

- def clear_knowledge_base():
-     global document_store
-     document_store = []
-     return "Knowledge base cleared."

  def process_pdf(file):
-     global document_store
      if file is not None:
-         reader = PdfReader(file.name)
-         text = ""
-         for page in reader.pages:
-             text += page.extract_text() + "\n"
-
-         # Simple text splitting (you might want to implement a more sophisticated method)
-         chunks = [text[i:i+1000] for i in range(0, len(text), 900)]
-
-         document_store = [(chunk, embedding_model.encode(chunk)) for chunk in chunks]
-         return f"PDF processed. {len(chunks)} chunks added to the knowledge base."
      return "No file uploaded."

  def process_question(question):
-     global document_store
-     if not document_store:
          return "Please upload a PDF first.", "", 0
-
-     question_embedding = embedding_model.encode(question)
-
-     # Find the most relevant chunks
-     similarities = [cosine_similarity([question_embedding], [doc_embedding])[0][0] for _, doc_embedding in document_store]
-     top_chunk_indices = np.argsort(similarities)[-3:][::-1]  # Get top 3 most similar chunks
-     context = "\n".join([document_store[i][0] for i in top_chunk_indices])

-     # Use the QA model to get the answer
-     qa_result = qa_model(question=question, context=context)
-     answer = qa_result['answer']
-     confidence_score = qa_result['score']
-     return answer, context, round(confidence_score, 2)

  CSS = """
  .duplicate-button { margin: auto !important; color: white !important; background: black !important; border-radius: 100vh !important;}
  h3, p, h1 { text-align: center; color: white;}
  footer { text-align: center; padding: 10px; width: 100%; background-color: rgba(240, 240, 240, 0.8); z-index: 1000; position: relative; margin-top: 10px; color: black;}
  """

  FOOTER_TEXT = """
  <footer>
  <p>If you enjoyed the functionality of the app, please leave a like!<br>
- Check out more on <a href="https://www.linkedin.com/in/your-linkedin/" target="_blank">LinkedIn</a> | <a href="https://your-portfolio-url.com/" target="_blank">Portfolio</a></p>
  </footer>
  """

  TITLE = "<h1>📚 RAG Document Q&A 📚</h1>"

  with gr.Blocks(css=CSS, theme="Nymbo/Nymbo_Theme") as demo:
      gr.HTML(TITLE)
-
      with gr.Tab("PDF Uploader"):
          pdf_file = gr.File(label="Upload PDF")
          upload_button = gr.Button("Process PDF")
          upload_output = gr.Textbox(label="Upload Status")
-         clear_button = gr.Button("Clear Knowledge Base")
-         clear_output = gr.Textbox(label="Clear Status")
-
      with gr.Tab("Q&A System"):
          question_input = gr.Textbox(lines=2, placeholder="Enter your question here...")
          submit_button = gr.Button("Ask Question")
          answer_output = gr.Textbox(label="Answer")
          context_output = gr.Textbox(label="Relevant Context", lines=10)
          confidence_output = gr.Number(label="Confidence Score")
-
      upload_button.click(process_pdf, inputs=[pdf_file], outputs=[upload_output])
-     clear_button.click(clear_knowledge_base, outputs=[clear_output])
      submit_button.click(process_question, inputs=[question_input], outputs=[answer_output, context_output, confidence_output])

      gr.HTML(FOOTER_TEXT)

  if __name__ == "__main__":
-     demo.launch()
 
  import os
  import gradio as gr
+ from langchain_groq import ChatGroq
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.chains.combine_documents import create_stuff_documents_chain
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain.chains import create_retrieval_chain
+ from langchain_community.vectorstores import FAISS
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain_community.embeddings import HuggingFaceEmbeddings
  from dotenv import load_dotenv

+ # Load environment variables
  load_dotenv()
+
+ # Load the GROQ API key
  GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

+ # Set up the language model
+ llm = ChatGroq(temperature=0, model_name='llama-3.1-8b-instant', groq_api_key=GROQ_API_KEY)

+ # Define the prompt template
+ prompt = ChatPromptTemplate.from_template("""
+ Answer the questions based on the provided context only.
+ Please provide the most accurate response based on the question.
+ <context>{context}</context>
+ Question: {input}
+ """)

+ # Set up embeddings model
+ embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+ vectors = None

+ # Function to process PDF files
  def process_pdf(file):
+     global vectors
      if file is not None:
+         loader = PyPDFLoader(file.name)
+         docs = loader.load()
+         text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+         final_documents = text_splitter.split_documents(docs)
+         if vectors is None:
+             vectors = FAISS.from_documents(final_documents, embeddings)
+         else:
+             vectors.add_documents(final_documents)
+         return "PDF processed and added to the knowledge base."
      return "No file uploaded."

+ # Function to clear the knowledge base
+ def clear_knowledge_base():
+     global vectors
+     vectors = None
+     return "Knowledge base cleared."
+
+ # Function to process questions
  def process_question(question):
+     if vectors is None:
          return "Please upload a PDF first.", "", 0

+     document_chain = create_stuff_documents_chain(llm, prompt)
+     retriever = vectors.as_retriever()
+     retrieval_chain = create_retrieval_chain(retriever, document_chain)

+     response = retrieval_chain.invoke({'input': question})
+     context = "\n\n".join([doc.page_content for doc in response["context"]])

+     # Calculate a confidence score based on the relevance of retrieved documents
+     confidence_score = sum([doc.metadata.get('score', 0) for doc in response["context"]]) / len(response["context"])

+     return response['answer'], context, round(confidence_score, 2)

+ # CSS styling
  CSS = """
  .duplicate-button { margin: auto !important; color: white !important; background: black !important; border-radius: 100vh !important;}
  h3, p, h1 { text-align: center; color: white;}
  footer { text-align: center; padding: 10px; width: 100%; background-color: rgba(240, 240, 240, 0.8); z-index: 1000; position: relative; margin-top: 10px; color: black;}
  """

+ # Footer text
  FOOTER_TEXT = """
  <footer>
  <p>If you enjoyed the functionality of the app, please leave a like!<br>
+ Check out more on <a href="https://www.linkedin.com/in/your-linkedin/" target="_blank">LinkedIn</a> |
+ <a href="https://your-portfolio-url.com/" target="_blank">Portfolio</a></p>
  </footer>
  """

+ # Title text
  TITLE = "<h1>📚 RAG Document Q&A 📚</h1>"

+ # Gradio interface
  with gr.Blocks(css=CSS, theme="Nymbo/Nymbo_Theme") as demo:
      gr.HTML(TITLE)
+
      with gr.Tab("PDF Uploader"):
          pdf_file = gr.File(label="Upload PDF")
          upload_button = gr.Button("Process PDF")
+         clear_button = gr.Button("Clear Knowledge Base")
          upload_output = gr.Textbox(label="Upload Status")
+
      with gr.Tab("Q&A System"):
          question_input = gr.Textbox(lines=2, placeholder="Enter your question here...")
          submit_button = gr.Button("Ask Question")
          answer_output = gr.Textbox(label="Answer")
          context_output = gr.Textbox(label="Relevant Context", lines=10)
          confidence_output = gr.Number(label="Confidence Score")
+
+     # Button actions
      upload_button.click(process_pdf, inputs=[pdf_file], outputs=[upload_output])
      submit_button.click(process_question, inputs=[question_input], outputs=[answer_output, context_output, confidence_output])

+     # Action to clear the knowledge base
+     clear_button.click(clear_knowledge_base, outputs=[upload_output])
+
      gr.HTML(FOOTER_TEXT)

+ # Launch the Gradio app
  if __name__ == "__main__":
+     demo.launch()
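
A quick way to exercise the updated handlers without launching the Gradio UI is to import the module and call them directly. The sketch below is only an illustration, not part of the commit: it assumes GROQ_API_KEY is set in the environment and the dependencies from the new imports (langchain, langchain-groq, faiss, pypdf) are installed; the PDF path "sample.pdf" and the sample question are hypothetical.

# Minimal smoke test for the updated app.py (hypothetical file path and question).
# Assumes GROQ_API_KEY is set and the new langchain/FAISS dependencies are installed.
from types import SimpleNamespace

import app  # importing runs the module-level setup (llm, prompt, embeddings) but not demo.launch()

# process_pdf reads file.name, matching the object Gradio's gr.File passes in
status = app.process_pdf(SimpleNamespace(name="sample.pdf"))
print(status)

# process_question returns (answer, retrieved context, confidence score)
answer, context, confidence = app.process_question("What is the document about?")
print(answer, confidence)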