import streamlit as st
import os
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from dotenv import load_dotenv
from PyPDF2 import PdfReader
import time
load_dotenv()

# Placeholder for custom CSS styling of the background and sidebar.
# NOTE(review): the markdown body is currently empty, so this call is a no-op.
st.markdown(
    """
    """,
    unsafe_allow_html=True,
)

# os.environ values must be strings: assigning the None returned by a missing
# HF_TOKEN would raise a confusing TypeError, so guard and warn instead.
_hf_token = os.getenv("HF_TOKEN")
if _hf_token:
    os.environ["HF_TOKEN"] = _hf_token
else:
    st.warning("HF_TOKEN is not set; HuggingFace embeddings may fail to authenticate.")
groq_api_key = os.getenv("GROQ_API_KEY")

# Pipeline: document loader -> text splitter -> embeddings -> vector store -> retrieval chain
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
llm = ChatGroq(model="Llama3-8b-8192", api_key=groq_api_key)
prompt_template = ChatPromptTemplate.from_template("""
Answer the following question from the provided context only.
Please provide the most accurate response based on the question
{context}
Question : {input}
""")
def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of every uploaded PDF.

    Args:
        pdf_docs: iterable of file-like objects readable by PyPDF2's PdfReader.

    Returns:
        A single string containing the text of all pages, in order.
    """
    pages = []
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() can return None for image-only pages; substitute
            # "" so the join below doesn't raise a TypeError.
            pages.append(page.extract_text() or "")
    # str.join avoids the quadratic behavior of repeated string +=.
    return "".join(pages)
def create_vector_embeddings(pdfText):
    """Build the FAISS vector index from the uploaded PDFs, once per session.

    Stores intermediate artifacts (raw text, splitter, chunks) and the final
    FAISS index in st.session_state; subsequent calls are no-ops while
    "vectors" is already present.
    """
    state = st.session_state
    if "vectors" in state:
        # Index already built for this session; nothing to do.
        return
    state.docs = get_pdf_text(pdfText)
    state.splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=400)
    state.final_docs = state.splitter.split_text(state.docs)
    state.vectors = FAISS.from_texts(state.final_docs, embeddings)
# One-time session-state defaults; reruns keep whatever values already exist.
for _key, _default in (("options", ["Select a query"]), ("user_prompt", "")):
    if _key not in st.session_state:
        st.session_state[_key] = _default
def autopopulate_promptsbydoctype(uploaded_text):
    """Seed the query dropdown with canned prompts when a PDF was uploaded.

    Args:
        uploaded_text: list of uploaded files (Streamlit UploadedFile
            objects); only the first file's name is inspected.
    """
    # Case-insensitive ".pdf" suffix check: the previous endswith("pdf") test
    # also matched names like "notapdf" and missed "FILE.PDF".
    if uploaded_text and uploaded_text[0].name.lower().endswith(".pdf"):
        # Canned queries offered for PDF documents.
        items_to_append = [
            "get all the programme details including rights and tape content etc in pointwise manner, dont miss any info",
            "give a structured short summary of the programmes and details",
            "give me programme package with programme details listed",
        ]
        for item_to_append in items_to_append:
            if item_to_append not in st.session_state.options:
                st.session_state.options.append(item_to_append)
st.title("Basic Document QnA")

# Sidebar: file upload and (one-time) vector-index construction.
with st.sidebar:
    st.title("Menu:")
    st.session_state.uploaded_text = st.file_uploader(
        "Upload your Files and Click on the Submit & Process Button",
        accept_multiple_files=True,
    )
    if st.button("Click To Process File"):
        with st.spinner("Processing..."):
            create_vector_embeddings(st.session_state.uploaded_text)
            st.write("Vector Database is ready")
            autopopulate_promptsbydoctype(st.session_state.uploaded_text)
    # NOTE(review): removed the broken commented-out st.markdown/cleanup code
    # here — its stray unterminated-string lines were a SyntaxError.

# Free-form query input; new entries are added to the dropdown options.
new_option = st.text_input("Or type your query here:")
if new_option and new_option not in st.session_state.options:
    st.session_state.options.append(new_option)
    st.session_state.user_prompt = new_option

# Offer the dropdown only for non-"Technical" documents, pre-selecting the
# current prompt when it is one of the known options.
if st.session_state.uploaded_text and "Technical" not in st.session_state.uploaded_text[0].name:
    st.session_state.user_prompt = st.selectbox(
        "Enter/Select your query from the document",
        st.session_state.options,
        index=st.session_state.options.index(st.session_state.user_prompt)
        if st.session_state.user_prompt in st.session_state.options
        else 0,
    )

if st.session_state.user_prompt and st.session_state.user_prompt != "Select a query":
    # Guard: querying before "Click To Process File" would otherwise raise
    # AttributeError on st.session_state.vectors.
    if "vectors" not in st.session_state:
        st.warning("Please upload and process a document first.")
    else:
        document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt_template)
        retriever = st.session_state.vectors.as_retriever()
        retrieval_chain = create_retrieval_chain(retriever, document_chain)
        # perf_counter measures wall-clock time; process_time excludes the
        # network wait of the LLM call and would report ~0.
        start = time.perf_counter()
        response = retrieval_chain.invoke({"input": st.session_state.user_prompt})
        print(f"Response time :{time.perf_counter()-start}")
        st.write(response['answer'])
        # Show the retrieved chunks that grounded the answer.
        with st.expander("Document similarity Search"):
            for doc in response['context']:
                st.write(doc.page_content)
                st.write('------------------------')