Spaces:

girishwangikar
/

RAG_Document_QA

Running

App Files Files Community

girishwangikar committed on Sep 5, 2024

Commit

e729802

verified ·

1 Parent(s): d25a56b

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -48

app.py CHANGED Viewed

@@ -1,71 +1,62 @@
 import os
 import gradio as gr
-from langchain_groq import ChatGroq
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.chains.combine_documents import create_stuff_documents_chain
-from langchain_core.prompts import ChatPromptTemplate
-from langchain.chains import create_retrieval_chain
-from langchain_community.vectorstores import FAISS
-from langchain_community.document_loaders import PyPDFLoader
-from langchain_community.embeddings import HuggingFaceEmbeddings
 from dotenv import load_dotenv
-from pydantic import ConfigDict
-load_dotenv()  # Load the GROQ API KEY
 GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
-# Configure Pydantic to allow arbitrary types
-config = ConfigDict(arbitrary_types_allowed=True)
-llm = ChatGroq(
-    temperature=0,
-    model_name='llama-3.1-8b-instant',
-    groq_api_key=GROQ_API_KEY,
-    model_config=config
-)
-prompt = ChatPromptTemplate.from_template("""
-Answer the questions based on the provided context only.
-Please provide the most accurate response based on the question
-<context>{context}</context>
-Question: {input}
-""")
-embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
-# Global variable to store the vector store
-vectors = None
 def clear_knowledge_base():
-    global vectors
-    vectors = None
     return "Knowledge base cleared."
 def process_pdf(file):
-    global vectors
     if file is not None:
-        loader = PyPDFLoader(file.name)
-        docs = loader.load()
-        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
-        final_documents = text_splitter.split_documents(docs)
-        vectors = FAISS.from_documents(final_documents, embeddings)
-        return "PDF processed and added to the knowledge base."
     return "No file uploaded."
 def process_question(question):
-    global vectors
-    if vectors is None:
         return "Please upload a PDF first.", "", 0
-    document_chain = create_stuff_documents_chain(llm, prompt)
-    retriever = vectors.as_retriever()
-    retrieval_chain = create_retrieval_chain(retriever, document_chain)
-    response = retrieval_chain.invoke({'input': question})
-    context = "\n\n".join([doc.page_content for doc in response["context"]])
-    confidence_score = sum([doc.metadata.get('score', 0) for doc in response["context"]]) / len(response["context"])
-    return response['answer'], context, round(confidence_score, 2)
 CSS = """
 .duplicate-button { margin: auto !important; color: white !important; background: black !important; border-radius: 100vh !important;}

 import os
 import gradio as gr
+from transformers import pipeline
+from sentence_transformers import SentenceTransformer
+from pypdf import PdfReader
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
 from dotenv import load_dotenv
+# Load variables from a .env file (if present) into the process environment.
+load_dotenv()
+# NOTE(review): GROQ_API_KEY is not referenced by the code visible in this
+# diff - confirm whether it is still needed.
 GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
+# Initialize models once at module level; the functions below share them:
+# a question-answering pipeline and a sentence-embedding model.
+qa_model = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
+embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+# Global store of (chunk_text, chunk_embedding) tuples built by process_pdf;
+# an empty list means no PDF has been indexed.
+document_store = []
 def clear_knowledge_base():
+    """Discard every stored chunk/embedding pair and return a status message."""
+    global document_store
+    # Rebind to a fresh empty list rather than mutating the old one in place.
+    document_store = list()
     return "Knowledge base cleared."
 def process_pdf(file):
+    """Extract the text of an uploaded PDF and index it as embedded chunks.
+
+    The previous contents of ``document_store`` are replaced, not extended.
+
+    Args:
+        file: Uploaded file object exposing the PDF path as ``.name``, or
+            ``None`` when nothing was uploaded.
+
+    Returns:
+        A human-readable status message for the UI.
+    """
+    global document_store
     if file is not None:
+        reader = PdfReader(file.name)
+        # Pages without a text layer (e.g. scanned images) may yield nothing
+        # from extract_text(); treat them as empty instead of failing on
+        # concatenation. Each page still contributes a trailing newline.
+        text = "".join((page.extract_text() or "") + "\n" for page in reader.pages)
+        if not text.strip():
+            # Nothing usable to index: leave the store empty so a later
+            # question is answered with "Please upload a PDF first." instead
+            # of running the QA model on whitespace.
+            document_store = []
+            return "No extractable text found in the PDF."
+        # Simple fixed-width splitting: 1000-character windows advanced by 900,
+        # so consecutive chunks overlap by 100 characters.
+        chunk_size, stride = 1000, 900
+        chunks = [text[i:i + chunk_size] for i in range(0, len(text), stride)]
+        # Encode all chunks in one batched call; row i is the embedding of chunk i.
+        chunk_embeddings = embedding_model.encode(chunks)
+        document_store = list(zip(chunks, chunk_embeddings))
+        return f"PDF processed. {len(chunks)} chunks added to the knowledge base."
     return "No file uploaded."
 def process_question(question):
+    """Answer a question from the indexed PDF chunks.
+
+    The question is embedded, the three most similar stored chunks (by cosine
+    similarity) are joined into a context, and the QA model extracts an
+    answer from that context.
+
+    Args:
+        question: The user's question text.
+
+    Returns:
+        A tuple ``(answer, context, confidence)`` where ``confidence`` is the
+        QA model's score rounded to two decimals. When no PDF is indexed or
+        the question is blank, ``answer`` is a prompt for the user, ``context``
+        is ``""`` and ``confidence`` is ``0``.
+    """
+    if not document_store:
         return "Please upload a PDF first.", "", 0
+    # The QA pipeline rejects empty questions; answer with a message instead
+    # of surfacing an exception in the UI.
+    if not question or not question.strip():
+        return "Please enter a question.", "", 0
+    question_embedding = embedding_model.encode(question)
+    chunk_texts, chunk_embeddings = zip(*document_store)
+    # Score every chunk against the question in one vectorised call.
+    similarities = cosine_similarity([question_embedding], np.vstack(chunk_embeddings))[0]
+    top_k = 3
+    # argsort is ascending: take the last top_k indices and reverse them so
+    # the most similar chunk comes first.
+    top_chunk_indices = np.argsort(similarities)[-top_k:][::-1]
+    context = "\n".join(chunk_texts[i] for i in top_chunk_indices)
+    # Use the QA model to get the answer
+    qa_result = qa_model(question=question, context=context)
+    return qa_result['answer'], context, round(float(qa_result['score']), 2)
 CSS = """
 .duplicate-button { margin: auto !important; color: white !important; background: black !important; border-radius: 100vh !important;}