Spaces:

SoumyaJ
/

PdfQandA

Sleeping

App Files Files Community

SoumyaJ committed on Aug 19, 2024

Commit

b379775

verified ·

1 Parent(s): 473ac4d

Upload 2 files

Browse files

Files changed (2) hide show

app.py +156 -0
requirements.txt +11 -0

app.py ADDED Viewed

	@@ -0,0 +1,156 @@

+import streamlit as st
+import os
+from langchain_groq import ChatGroq
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain.chains.combine_documents import create_stuff_documents_chain
+from langchain_core.prompts import ChatPromptTemplate
+from langchain.chains import create_retrieval_chain
+from langchain_community.vectorstores import FAISS
+from langchain_community.document_loaders import PyPDFLoader
+from dotenv import load_dotenv
+from PyPDF2 import PdfReader
+import time
+# Load variables from a local .env file into the process environment so the
+# os.getenv() lookups further down can see them.
+load_dotenv()
+## CSS for the background and sidebar styling
+# The stylesheet is injected as raw HTML through st.markdown; it sets a
+# textured background on the app container, styles the sidebar content box,
+# and defines a fixed-position "bottom-button" helper class.
+# NOTE(review): the `.sidebar .sidebar-content` selectors may not match the DOM
+# emitted by the installed Streamlit version — confirm the sidebar rules apply.
+st.markdown(
+    """
+    <style>
+    .stApp {
+        background-image: url('https://www.transparenttextures.com/patterns/white-leather.png');
+        background-size: cover;
+    }
+    .sidebar .sidebar-content {
+        padding: 20px;
+        background-image: url('https://www.transparenttextures.com/patterns/asfalt-light.png');
+        background-size: cover;
+        border-radius: 10px;
+        box-shadow: 0px 0px 10px rgba(0, 0, 0, 0.1);
+    }
+    .sidebar .bottom-button {
+        position: fixed;
+        bottom: 20px;
+        left: 20px;
+        width: calc(100% - 40px);
+    }
+    </style>
+    """,
+    unsafe_allow_html=True  # needed so the <style> tag is rendered as HTML instead of escaped text
+)
+os.environ["HF_TOKEN"] = os.getenv("HF_TOKEN")
+groq_api_key = os.getenv("GROQ_API_KEY")
+#documentloader -> text splitter -> embeddings -> vector store -> use retriever chains
+embeddings = HuggingFaceEmbeddings(model_name = "all-MiniLM-L6-v2")
+llm = ChatGroq(model = "Llama3-8b-8192",api_key = groq_api_key)
+prompt_template = ChatPromptTemplate.from_template("""
+Answer the following question from the provided context only.
+Please provide the most accurate response based on the question
+<context>
+{context}
+</context>
+Question : {input}
+""")
+def get_pdf_text(pdf_docs):
+    text=""
+    for pdf in pdf_docs:
+        pdf_reader= PdfReader(pdf)
+        for page in pdf_reader.pages:
+            text+= page.extract_text()
+    return  text
+def create_vector_embeddings(pdfText):
+    if "vectors" not in st.session_state:
+        st.session_state.docs = get_pdf_text(pdfText)
+        st.session_state.splitter = RecursiveCharacterTextSplitter(chunk_size=1200,chunk_overlap=400)
+        st.session_state.final_docs = st.session_state.splitter.split_text(st.session_state.docs)
+        st.session_state.vectors = FAISS.from_texts(st.session_state.final_docs, embeddings)
+if "options" not in st.session_state:
+    st.session_state.options = ["Select a query"]
+if "user_prompt" not in st.session_state:
+    st.session_state.user_prompt = ""
+def autopopulate_promptsbydoctype(uploaded_text):
+    if uploaded_text and uploaded_text[0].name.endswith("pdf"):
+        #autopopulate all the questions in pdf
+        itemsToAppend = ["get all the programme details including rights and tape content etc in pointwise manner, dont miss any info",
+                         "give a structured short summary of the programmes and details",
+                         "give me programme package with programme details listed"]
+        for itemToAppend in itemsToAppend:
+            if itemToAppend not in st.session_state.options:
+                st.session_state.options.append(itemToAppend)
+st.title("Basic Document QnA")
+with st.sidebar:
+        st.title("Menu:")
+        #if "uploaded_text" not in st.session_state:
+        st.session_state.uploaded_text = st.file_uploader("Upload your Files and Click on the Submit & Process Button", accept_multiple_files=True)
+        if st.button("Click To Process File"):
+            with st.spinner("Processing..."):
+                create_vector_embeddings(st.session_state.uploaded_text)
+                st.write("Vector Database is ready")
+                autopopulate_promptsbydoctype(st.session_state.uploaded_text)
+        # st.markdown('<div class="bottom-button">', unsafe_allow_html=True)
+        # params  = ['docs', 'splitter','final_docs']
+        # if st.button("Clean Current Document Settings") and st.session_state.keys():
+        #     with st.spinner("Cleaning In Progress...."):
+        #         for param in params:
+        #             if param in st.session_state:
+        #                 del st.session_state[param]
+        #         st.session_state['uploaded_text'] = ""
+        #         st.write("Cleanup completed..")
+        # st.markdown('</div>', unsafe_allow_html=True)
+new_option = st.text_input("Or type your query here:")
+if new_option and new_option not in st.session_state.options:
+    st.session_state.options.append(new_option)
+    st.session_state.user_prompt = new_option
+if st.session_state.uploaded_text and "Technical" not in st.session_state.uploaded_text[0].name:
+    st.session_state.user_prompt= st.selectbox("Enter/Select your query from the document", st.session_state.options,
+        index=st.session_state.options.index(st.session_state.user_prompt) if st.session_state.user_prompt in st.session_state.options else 0)
+if st.session_state.user_prompt and st.session_state.user_prompt != "Select a query":
+    #st.write(st.session_state.user_prompt)
+    document_chain = create_stuff_documents_chain(llm=llm, prompt= prompt_template)
+    retriever = st.session_state.vectors.as_retriever()
+    retrieval_chain=create_retrieval_chain(retriever,document_chain)
+    start = time.process_time()
+    response = retrieval_chain.invoke({"input": st.session_state.user_prompt})
+    print(f"Response time :{time.process_time()-start}")
+    st.write(response['answer'])
+     ## With a streamlit expander
+    with st.expander("Document similarity Search"):
+        for i,doc in enumerate(response['context']):
+            st.write(doc.page_content)
+            st.write('------------------------')

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+langchain
+langchain-community
+pypdf
+langchain_huggingface
+faiss-cpu
+langchain-text-splitters
+python-dotenv
+sentence_transformers
+langchain_groq
+streamlit
+PyPDF2