Spaces:

SoumyaJ
/

PdfQandA

Sleeping

File size: 6,128 Bytes

b379775

import streamlit as st
import os
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from dotenv import load_dotenv
from PyPDF2 import PdfReader
import time

# Load variables from a .env file (python-dotenv) before any key is read below.
load_dotenv()

# Page-wide CSS injected as raw HTML: a textured background for the app
# container plus rules for the sidebar content area and a bottom-pinned button.
_PAGE_STYLE = """

    <style>

    .stApp {

        background-image: url('https://www.transparenttextures.com/patterns/white-leather.png');

        background-size: cover;

    }

    .sidebar .sidebar-content {

        padding: 20px;

        background-image: url('https://www.transparenttextures.com/patterns/asfalt-light.png');

        background-size: cover;

        border-radius: 10px;

        box-shadow: 0px 0px 10px rgba(0, 0, 0, 0.1);

    }

    .sidebar .bottom-button {

        position: fixed;

        bottom: 20px;

        left: 20px;

        width: calc(100% - 40px);

    }

    </style>

    """

# unsafe_allow_html is required, otherwise the <style> tag is rendered as text.
st.markdown(_PAGE_STYLE, unsafe_allow_html=True)

# Re-export HF_TOKEN only when it is actually configured: os.environ accepts
# strings only, so assigning the None returned by os.getenv for a missing
# variable would raise TypeError and abort the app at import time.
_hf_token = os.getenv("HF_TOKEN")
if _hf_token is not None:
    os.environ["HF_TOKEN"] = _hf_token

# API key handed to the Groq chat model; None when GROQ_API_KEY is not set.
groq_api_key = os.getenv("GROQ_API_KEY")

# Pipeline: document loader -> text splitter -> embeddings -> vector store
# -> retriever chain. The shared, module-level pieces are created here.
_EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
_CHAT_MODEL_NAME = "Llama3-8b-8192"

# Prompt text: {context} receives the retrieved chunks, {input} the user query.
_QA_TEMPLATE = """

Answer the following question from the provided context only. 

Please provide the most accurate response based on the question

<context>

{context}

</context>                                                   

Question : {input}

"""

embeddings = HuggingFaceEmbeddings(model_name=_EMBEDDING_MODEL_NAME)

llm = ChatGroq(model=_CHAT_MODEL_NAME, api_key=groq_api_key)

prompt_template = ChatPromptTemplate.from_template(_QA_TEMPLATE)

def get_pdf_text(pdf_docs):
    """Return the text of every page of every PDF in *pdf_docs* as one string.

    Args:
        pdf_docs: Iterable of PDF sources, each passed to ``PdfReader``.

    Returns:
        The page texts concatenated in document order, then page order, with
        no separator between pages. Returns ``""`` when *pdf_docs* is empty.
    """
    # Build the result with a single join instead of growing a string with
    # ``+=`` per page. A page whose extraction yields nothing (None or "")
    # contributes empty text rather than raising on ``str + None``.
    return "".join(
        page.extract_text() or ""
        for pdf in pdf_docs
        for page in PdfReader(pdf).pages
    )

def create_vector_embeddings(pdfText):
    """Build the FAISS index for the uploaded PDFs and cache it in session state.

    The extracted text, the splitter, the text chunks and the vector store are
    stored under ``st.session_state`` as ``docs``, ``splitter``,
    ``final_docs`` and ``vectors``. The index is reused while the same set of
    uploads is processed again, and rebuilt when the uploads change.

    Args:
        pdfText: The uploaded PDF files (may be ``None`` or empty).

    Returns:
        None. If no text can be extracted, an error is shown and the current
        script run is stopped, so no index is left in session state.
    """
    uploads = pdfText or []

    # Fingerprint the upload set by (name, size). Without this, the first
    # index built would be kept forever and a newly uploaded document would
    # silently be answered from the previous one.
    signature = tuple(
        (getattr(item, "name", None), getattr(item, "size", None))
        for item in uploads
    )
    already_indexed = (
        "vectors" in st.session_state
        and st.session_state.get("vectors_source") == signature
    )
    if already_indexed:
        return

    text = get_pdf_text(uploads)
    splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=400)
    chunks = splitter.split_text(text)

    if not chunks:
        # An index cannot be built from zero chunks; drop any stale index so
        # later queries are not answered from a previously processed file.
        for stale_key in ("vectors", "vectors_source"):
            if stale_key in st.session_state:
                del st.session_state[stale_key]
        st.error(
            "No extractable text was found in the uploaded file(s). "
            "Please upload a text-based PDF."
        )
        st.stop()

    st.session_state.docs = text
    st.session_state.splitter = splitter
    st.session_state.final_docs = chunks
    st.session_state.vectors = FAISS.from_texts(chunks, embeddings)
    st.session_state.vectors_source = signature

# Seed the per-session values on the first run only; later reruns keep
# whatever the session already holds.
for _state_key, _initial_value in (
    ("options", ["Select a query"]),
    ("user_prompt", ""),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value

def autopopulate_promptsbydoctype(uploaded_text):
    """Add the predefined PDF queries to the query options in session state.

    Only the first uploaded file is inspected. When its name has a ``.pdf``
    extension (case-insensitive), each predefined query that is not already
    in ``st.session_state.options`` is appended to it.

    Args:
        uploaded_text: Sequence of uploaded files exposing a ``name``
            attribute; may be ``None`` or empty.

    Returns:
        None.
    """
    if not uploaded_text:
        return

    # Compare the real extension, ignoring case: "REPORT.PDF" qualifies,
    # while a name that merely ends in the letters "pdf" does not.
    if not uploaded_text[0].name.lower().endswith(".pdf"):
        return

    suggested_queries = (
        "get all the programme details including rights and tape content etc in pointwise manner, dont miss any info",
        "give a structured short summary of the programmes and details",
        "give me programme package with programme details listed",
    )

    options = st.session_state.options
    for query in suggested_queries:
        if query not in options:
            options.append(query)

st.title("Basic Document QnA")

# --- Sidebar: upload files and build the vector index on demand -------------
with st.sidebar:
    st.title("Menu:")
    st.session_state.uploaded_text = st.file_uploader(
        "Upload your Files and Click on the Submit & Process Button",
        accept_multiple_files=True,
    )
    if st.button("Click To Process File"):
        with st.spinner("Processing..."):
            create_vector_embeddings(st.session_state.uploaded_text)
            st.write("Vector Database is ready")
            autopopulate_promptsbydoctype(st.session_state.uploaded_text)

# --- Main area: type a query or pick one from the dropdown -------------------
new_option = st.text_input("Or type your query here:")

# A query typed for the first time joins the dropdown options and becomes the
# active prompt.
if new_option and new_option not in st.session_state.options:
    st.session_state.options.append(new_option)
    st.session_state.user_prompt = new_option

# The dropdown is offered only when something is uploaded and the first
# file's name does not contain "Technical".
uploaded_files = st.session_state.uploaded_text
if uploaded_files and "Technical" not in uploaded_files[0].name:
    query_options = st.session_state.options
    current_prompt = st.session_state.user_prompt
    st.session_state.user_prompt = st.selectbox(
        "Enter/Select your query from the document",
        query_options,
        index=query_options.index(current_prompt) if current_prompt in query_options else 0,
    )

# --- Answer the active query --------------------------------------------------
user_prompt = st.session_state.user_prompt
if user_prompt and user_prompt != "Select a query":
    if "vectors" not in st.session_state:
        # No index exists until a file has been processed; reading
        # st.session_state.vectors here would raise AttributeError.
        st.warning(
            "Please upload a document and click 'Click To Process File' "
            "before asking a question."
        )
    else:
        document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt_template)
        retriever = st.session_state.vectors.as_retriever()
        retrieval_chain = create_retrieval_chain(retriever, document_chain)

        # perf_counter measures elapsed wall-clock time; process_time counts
        # only CPU time and so excludes the time spent waiting on the model.
        start = time.perf_counter()
        response = retrieval_chain.invoke({"input": user_prompt})
        print(f"Response time :{time.perf_counter()-start}")

        st.write(response['answer'])

        # Show the retrieved chunks that were passed to the model as context.
        with st.expander("Document similarity Search"):
            for doc in response['context']:
                st.write(doc.page_content)
                st.write('------------------------')