import streamlit as st
import os
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from dotenv import load_dotenv
from PyPDF2 import PdfReader
import time
load_dotenv()

# Placeholder for custom CSS styling of the background and sidebar.
# NOTE(review): the markdown body is currently empty, so this call is a no-op.
st.markdown(
    """
    """,
    unsafe_allow_html=True,
)

# os.environ values must be strings: assigning the None returned by a missing
# HF_TOKEN would raise a confusing TypeError, so guard and warn instead.
_hf_token = os.getenv("HF_TOKEN")
if _hf_token:
    os.environ["HF_TOKEN"] = _hf_token
else:
    st.warning("HF_TOKEN is not set; HuggingFace embeddings may fail to authenticate.")
groq_api_key = os.getenv("GROQ_API_KEY")

# Pipeline: document loader -> text splitter -> embeddings -> vector store -> retrieval chain
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
llm = ChatGroq(model="Llama3-8b-8192", api_key=groq_api_key)
prompt_template = ChatPromptTemplate.from_template("""
Answer the following question from the provided context only.
Please provide the most accurate response based on the question
{context}
Question : {input}
""")
def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of every uploaded PDF.

    Args:
        pdf_docs: iterable of file-like objects readable by PyPDF2's PdfReader.

    Returns:
        A single string containing the text of all pages, in order.
    """
    pages = []
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() can return None for image-only pages; substitute
            # "" so the join below doesn't raise a TypeError.
            pages.append(page.extract_text() or "")
    # str.join avoids the quadratic behavior of repeated string +=.
    return "".join(pages)
def create_vector_embeddings(pdfText):
    """Build the FAISS vector index from the uploaded PDFs, once per session.

    Stores intermediate artifacts (raw text, splitter, chunks) and the final
    FAISS index in st.session_state; subsequent calls are no-ops while
    "vectors" is already present.
    """
    state = st.session_state
    if "vectors" in state:
        # Index already built for this session; nothing to do.
        return
    state.docs = get_pdf_text(pdfText)
    state.splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=400)
    state.final_docs = state.splitter.split_text(state.docs)
    state.vectors = FAISS.from_texts(state.final_docs, embeddings)
# One-time session-state defaults; reruns keep whatever values already exist.
for _key, _default in (("options", ["Select a query"]), ("user_prompt", "")):
    if _key not in st.session_state:
        st.session_state[_key] = _default
def autopopulate_promptsbydoctype(uploaded_text):
    """Seed the query dropdown with canned prompts when a PDF was uploaded.

    Args:
        uploaded_text: list of uploaded files (Streamlit UploadedFile
            objects); only the first file's name is inspected.
    """
    # Case-insensitive ".pdf" suffix check: the previous endswith("pdf") test
    # also matched names like "notapdf" and missed "FILE.PDF".
    if uploaded_text and uploaded_text[0].name.lower().endswith(".pdf"):
        # Canned queries offered for PDF documents.
        items_to_append = [
            "get all the programme details including rights and tape content etc in pointwise manner, dont miss any info",
            "give a structured short summary of the programmes and details",
            "give me programme package with programme details listed",
        ]
        for item_to_append in items_to_append:
            if item_to_append not in st.session_state.options:
                st.session_state.options.append(item_to_append)
st.title("Basic Document QnA")

# Sidebar: file upload and (one-time) vector-index construction.
with st.sidebar:
    st.title("Menu:")
    st.session_state.uploaded_text = st.file_uploader(
        "Upload your Files and Click on the Submit & Process Button",
        accept_multiple_files=True,
    )
    if st.button("Click To Process File"):
        with st.spinner("Processing..."):
            create_vector_embeddings(st.session_state.uploaded_text)
            st.write("Vector Database is ready")
            autopopulate_promptsbydoctype(st.session_state.uploaded_text)
    # NOTE(review): removed the broken commented-out st.markdown/cleanup code
    # here — its stray unterminated-string lines were a SyntaxError.

# Free-form query input; new entries are added to the dropdown options.
new_option = st.text_input("Or type your query here:")
if new_option and new_option not in st.session_state.options:
    st.session_state.options.append(new_option)
    st.session_state.user_prompt = new_option

# Offer the dropdown only for non-"Technical" documents, pre-selecting the
# current prompt when it is one of the known options.
if st.session_state.uploaded_text and "Technical" not in st.session_state.uploaded_text[0].name:
    st.session_state.user_prompt = st.selectbox(
        "Enter/Select your query from the document",
        st.session_state.options,
        index=st.session_state.options.index(st.session_state.user_prompt)
        if st.session_state.user_prompt in st.session_state.options
        else 0,
    )

if st.session_state.user_prompt and st.session_state.user_prompt != "Select a query":
    # Guard: querying before "Click To Process File" would otherwise raise
    # AttributeError on st.session_state.vectors.
    if "vectors" not in st.session_state:
        st.warning("Please upload and process a document first.")
    else:
        document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt_template)
        retriever = st.session_state.vectors.as_retriever()
        retrieval_chain = create_retrieval_chain(retriever, document_chain)
        # perf_counter measures wall-clock time; process_time excludes the
        # network wait of the LLM call and would report ~0.
        start = time.perf_counter()
        response = retrieval_chain.invoke({"input": st.session_state.user_prompt})
        print(f"Response time :{time.perf_counter()-start}")
        st.write(response['answer'])
        # Show the retrieved chunks that grounded the answer.
        with st.expander("Document similarity Search"):
            for doc in response['context']:
                st.write(doc.page_content)
                st.write('------------------------')