Spaces:
Running
Running
Upload 5 files
Browse files- .env +1 -0
- app.py +53 -0
- chain.py +42 -0
- embeddings.py +50 -0
- response.py +28 -0
.env
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
# SECURITY: a real API key was committed here — revoke it immediately and
# supply the value via the platform's secrets manager instead of the repo.
MISTRAL_API_KEY="<your-mistral-api-key>"
|
app.py
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Streamlit front-end: upload a PDF, index it for a user, then ask questions."""
import streamlit as st
import os
import shutil
from chain import extract_pdf_with_user, get_final_aswer

# Directory where uploaded PDFs are persisted between the two steps.
UPLOAD_DIR = "uploads"
os.makedirs(UPLOAD_DIR, exist_ok=True)

st.title("PDF Extraction and Question Answering")

# --- Step 1: Upload PDF and extract ---

st.header("Upload PDF and extract text")

user_id = st.text_input("User ID")
name = st.text_input("Save PDF name (without extension)")

uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])

if st.button("Extract PDF"):
    if not user_id or not name or not uploaded_file:
        st.error("Please provide User ID, name, and upload a PDF file.")
    else:
        # Save the uploaded PDF. basename() strips any directory components a
        # crafted filename could carry (path-traversal guard).
        upload_path = os.path.join(UPLOAD_DIR, os.path.basename(uploaded_file.name))
        with open(upload_path, "wb") as f:
            shutil.copyfileobj(uploaded_file, f)

        # Extract text, embed it, and register the document for this user.
        documents = extract_pdf_with_user(user_id, upload_path, name)
        st.success("PDF processed!")
        st.write("Extracted documents:")
        st.write(documents)

# --- Step 2: Ask a question ---

st.header("Ask a question")

query = st.text_input("Enter your question")

if st.button("Get Answer"):
    if not user_id or not name or not query:
        st.error("Please provide User ID, name, and your question.")
    else:
        # Step 1 stores a copy named "<name>.pdf"; that path doubles as the
        # document id used when filtering embeddings.
        pdf_path = os.path.join(UPLOAD_DIR, f"{name}.pdf")
        if not os.path.exists(pdf_path):
            st.error(f"No PDF found with name '{name}.pdf'. Please upload and extract first.")
        else:
            answer = get_final_aswer(query, user_id, pdf_path)
            st.success("Answer generated!")
            st.write(answer)
|
chain.py
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
from embeddings import set_embedding
|
3 |
+
from embeddings import get_chunks
|
4 |
+
from user_data import user_doc,get_docs_by_user
|
5 |
+
from response import get_answer
|
6 |
+
from PyPDF2 import PdfReader
|
7 |
+
import os
|
8 |
+
import shutil
|
9 |
+
|
10 |
+
|
11 |
+
def extract_pdf_with_user(user_id: str, pdf_path: str, name: str):
    """Copy the PDF to "<name>.pdf", extract its text, embed it, and register it.

    Args:
        user_id: Owner of the document; used to tag the stored embeddings.
        pdf_path: Path of the uploaded PDF file.
        name: Target file name (without extension) for the stored copy.

    Returns:
        The user's registered documents, as returned by ``get_docs_by_user``.
        (The original annotated this ``-> tuple``, which did not match.)
    """
    # Store the document under a caller-chosen, stable name; skip the copy
    # when the upload already has that exact path.
    dir_path = os.path.dirname(pdf_path)
    new_pdf_path = os.path.join(dir_path, f"{name}.pdf")
    if pdf_path != new_pdf_path:
        shutil.copy(pdf_path, new_pdf_path)

    # Concatenate every page's text. extract_text() can return None for
    # pages with no extractable text, hence the `or ""` guard; join() avoids
    # the quadratic string `+=` of the original loop.
    reader = PdfReader(new_pdf_path)
    text = "".join((page.extract_text() or "") for page in reader.pages)

    # The stored path doubles as the document id in the vector store.
    set_embedding(text, new_pdf_path, user_id)
    user_doc(user_id, new_pdf_path)
    return get_docs_by_user(user_id)
|
29 |
+
def get_final_aswer(query: str, user_id: str, pdf_path: str) -> str:
    """Answer *query* from chunks retrieved for this user's document.

    NOTE(review): the misspelled name ("aswer") is kept because app.py
    imports the function under this exact name.

    Args:
        query: The user's question.
        user_id: Owner whose embeddings should be searched.
        pdf_path: Stored PDF path, used as the document id for retrieval.

    Returns:
        The model's answer text (``response.content`` from the LLM; the
        original's ``-> list`` annotation did not match).
    """
    chunks = get_chunks(query, user_id, pdf_path)
    # Join the retrieved chunks into one context string (join() instead of
    # the original quadratic `+=` accumulation).
    full_text = "".join(chunks)
    return get_answer(query, full_text)
|
38 |
+
|
39 |
+
|
40 |
+
|
41 |
+
|
42 |
+
|
embeddings.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
##return embedding vector for a given text
|
2 |
+
##uses senetence based emebdings
|
3 |
+
from langchain_text_splitters import CharacterTextSplitter
|
4 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
5 |
+
from langchain_core.documents import Document
|
6 |
+
from langchain_chroma import Chroma
|
7 |
+
|
8 |
+
# Embedding model: sentence-level MiniLM run on CPU; output vectors are
# L2-normalized so cosine similarity reduces to a dot product.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}
hf = HuggingFaceEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)


# Persistent Chroma collection shared by set_embedding() and get_chunks();
# data survives process restarts via the local "chroma_langchain_db" directory.
vector_store = Chroma(
    collection_name="collection",
    embedding_function=hf,
    persist_directory="chroma_langchain_db",
)
|
21 |
+
|
22 |
+
|
23 |
+
def set_embedding(text: str, doc_id: str, user_id: str) -> None:
    """Split *text* into token-based chunks and index them in the vector store.

    Each chunk becomes a Document tagged with ``{doc_id, user_id}`` metadata
    and a deterministic id of the form ``"<user_id><doc_id><index>"`` (same
    scheme as the original, so existing entries are overwritten consistently).

    Args:
        text: Full extracted text of the document.
        doc_id: Identifier of the source document (the stored PDF path).
        user_id: Owner of the document.
    """
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        encoding_name="cl100k_base", chunk_size=300, chunk_overlap=40
    )
    texts = text_splitter.split_text(text)  # list[str]

    # Build all Documents locally, then add them in one batch call instead of
    # one store round-trip per chunk. (The original stashed each Document in
    # globals(), leaking module state, and crashed on empty `texts` via
    # `print(type(texts[0]))`.)
    documents = []
    ids = []
    for i, chunk in enumerate(texts):
        vector_id = f"{user_id}{doc_id}{i}"
        documents.append(
            Document(
                page_content=chunk,
                metadata={"doc_id": doc_id, "user_id": user_id},
                id=vector_id,
            )
        )
        ids.append(vector_id)

    if documents:  # add_documents rejects an empty batch
        vector_store.add_documents(documents, ids=ids)
        print(f"Added {len(documents)} chunks for doc {doc_id}")
|
40 |
+
|
41 |
+
def get_chunks(query: str, user_id: str, doc_id: str) -> list:
    """Return the text of the top-5 stored chunks most similar to *query*.

    BUG FIX: the original accepted *doc_id* but never used it, so retrieval
    could mix chunks from all of the user's documents. The filter now matches
    both user_id and doc_id.

    Args:
        query: Natural-language question to search with.
        user_id: Owner whose chunks may be returned.
        doc_id: Document identifier the chunks must belong to.

    Returns:
        List of chunk strings (page_content), best match first.
    """
    results = vector_store.similarity_search(
        query,
        k=5,
        # Chroma requires $and to combine multiple metadata conditions.
        filter={"$and": [{"user_id": user_id}, {"doc_id": doc_id}]},
    )
    return [res.page_content for res in results]
|
response.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## Response generation: fill the retrieved context and the user's question
## into a prompt template and query the Mistral chat model for the answer.
## (Previous header comments described the embeddings module and were a
## copy-paste leftover.)
|
5 |
+
# Load MISTRAL_API_KEY (and anything else in .env) into the environment
# before ChatMistralAI below is constructed, since the client reads the
# key from the environment at construction time.
from dotenv import load_dotenv
load_dotenv()

from langchain_mistralai import ChatMistralAI

# Deterministic chat model (temperature=0); retry at most once on failure.
llm = ChatMistralAI(
    model="mistral-large-latest",
    temperature=0,
    max_retries=1,
)

# Prompt skeleton filled in by get_answer(). NOTE(review): the name keeps
# its original misspelling ("tamplet") because get_answer() references it.
prompt_tamplet = """
You just need to answer the question based on the following context.
QUESTIONS : {question}
CONTEXT : {context}
"""
|
21 |
+
|
22 |
+
|
23 |
+
|
24 |
+
def get_answer(question: str, context: str) -> str:
    """Ask the LLM *question* grounded in *context* and return its text.

    BUG FIX: the original used ``str.format`` on the template, so any
    literal brace ("{" or "}") in the PDF-derived context or question raised
    KeyError/ValueError. ``str.replace`` is immune to brace injection.

    Args:
        question: The user's question.
        context: Retrieved document text to ground the answer in.

    Returns:
        The model's answer text (``response.content``).
    """
    final_prompt = (
        prompt_tamplet
        .replace("{context}", context)
        .replace("{question}", question)
    )
    response = llm.invoke(final_prompt)
    return response.content
|