# chatwithpdf / app.py
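"""
Chat with your PDF - a Streamlit app.

Flow, as implemented below: the user uploads a PDF, PyPDFLoader extracts the
pages, a FAISS vector store is built from sentence-transformers embeddings,
and questions are answered through a ConversationalRetrievalChain backed by a
CPU Hugging Face text-generation pipeline (microsoft/phi-2).

Run with: streamlit run app.py
Approximate dependencies (not pinned in this file): streamlit, transformers,
torch, accelerate, langchain, langchain-community, faiss-cpu, pypdf,
sentence-transformers.
"""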
import streamlit as st
import tempfile
import os
import re
import torch
from threading import Thread
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, TextIteratorStreamer
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain.chains import ConversationalRetrievalChain
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
# Function returns langchain document objects of PDF pages
def fn_read_pdf(lv_temp_file_path, mv_processing_message):
    """Returns langchain document objects of the PDF pages (one Document per page)"""
    lv_pdf_loader = PyPDFLoader(lv_temp_file_path)
    lv_pdf_content = lv_pdf_loader.load()

    print("Step2: PDF content extracted")
    mv_processing_message.text("Step2: PDF content extracted")

    return lv_pdf_content
# Function returns FAISS vector store of PDF content
def fn_create_faiss_vector_store(lv_pdf_content, mv_processing_message):
    """Returns FAISS vector store index of the PDF content"""
    # -- CPU embeddings from sentence-transformers (msmarco-distilbert-base-v4)
    lv_embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/msmarco-distilbert-base-v4",
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': False}
    )
    lv_vector_store = FAISS.from_documents(lv_pdf_content, lv_embeddings)

    print("Step3: Vector store created")
    mv_processing_message.text("Step3: Vector store created")

    return lv_vector_store
# Function returns QA response using the vector store
def fn_generate_QnA_response(mv_selected_model, mv_user_question, lv_vector_store, mv_processing_message):
    """Returns QA response for the user question using the vector store"""
    lv_chat_history = []

    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []
    else:
        lv_chat_history = st.session_state.chat_history

    print("Step4: Generating LLM response")
    mv_processing_message.text("Step4: Generating LLM response")

    lv_tokenizer = AutoTokenizer.from_pretrained(mv_selected_model, trust_remote_code=True)
    lv_model = AutoModelForCausalLM.from_pretrained(
        mv_selected_model,
        torch_dtype="auto",
        device_map="cpu",
        trust_remote_code=True
    )

    # lv_streamer = TextIteratorStreamer(
    #     tokenizer=lv_tokenizer,
    #     skip_prompt=True,
    #     skip_special_tokens=True,
    #     timeout=300.0
    # )
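    # -- Streaming is currently disabled (the streamer above is commented out).
    # -- Minimal sketch of how it could be wired in, assuming generation were
    # -- driven directly through lv_model.generate on a background Thread
    # -- rather than through the retrieval chain below; lv_inputs, lv_thread
    # -- and lv_new_text are illustrative names, not part of the original code:
    # lv_inputs = lv_tokenizer(mv_user_question, return_tensors="pt")
    # lv_thread = Thread(
    #     target=lv_model.generate,
    #     kwargs=dict(**lv_inputs, streamer=lv_streamer, max_new_tokens=512)
    # )
    # lv_thread.start()
    # for lv_new_text in lv_streamer:
    #     print(lv_new_text, end="", flush=True)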
    # -- Wrap the Hugging Face text-generation pipeline for LangChain
    lv_ms_phi2_pipeline = pipeline(
        "text-generation", tokenizer=lv_tokenizer, model=lv_model,
        device_map="cpu", max_new_tokens=512, return_full_text=True
    )
    lv_hf_phi2_pipeline = HuggingFacePipeline(pipeline=lv_ms_phi2_pipeline)

    # -- Retrieve relevant chunks from the vector store and answer using chat history
    lv_chain = ConversationalRetrievalChain.from_llm(lv_hf_phi2_pipeline, lv_vector_store.as_retriever(), return_source_documents=True)
    lv_response = lv_chain.invoke({"question": mv_user_question, 'chat_history': lv_chat_history})

    lv_chat_history += [(mv_user_question, lv_response["answer"])]
    st.session_state.chat_history = lv_chat_history

    print("Step5: LLM response generated")
    mv_processing_message.text("Step5: LLM response generated")

    return lv_response['answer']
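
# -- Note: fn_generate_QnA_response reloads the tokenizer and model on every
# -- question. A possible optimization (sketch only, not wired in) would be to
# -- cache the wrapped pipeline across Streamlit reruns with st.cache_resource;
# -- fn_load_llm is a hypothetical helper name:
#
# @st.cache_resource
# def fn_load_llm(mv_selected_model):
#     lv_tokenizer = AutoTokenizer.from_pretrained(mv_selected_model, trust_remote_code=True)
#     lv_model = AutoModelForCausalLM.from_pretrained(
#         mv_selected_model, torch_dtype="auto", device_map="cpu", trust_remote_code=True
#     )
#     return HuggingFacePipeline(pipeline=pipeline(
#         "text-generation", tokenizer=lv_tokenizer, model=lv_model,
#         device_map="cpu", max_new_tokens=512, return_full_text=True
#     ))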
# Main Function
def main():

    # -- Streamlit Settings
    st.set_page_config(layout='wide')

    # -- Initialize chat history
    if "messages" not in st.session_state:
        st.session_state.messages = []

    col1, col2, col3 = st.columns(3)
    col2.title("Chat with your PDF")
    st.text("")

    col1, col2, col3 = st.columns(3)
    mv_selected_model = col3.selectbox('Select Model', ['microsoft/phi-2'])
    st.text("")
    st.text("")
    st.text("")

    col1, col2, col3 = st.columns(3)

    # -- Reading PDF File
    mv_pdf_input_file = col2.file_uploader("Choose a PDF file:", type=["pdf"])

    if 'mv_temp_file_storage_dir' not in st.session_state:
        mv_temp_file_storage_dir = tempfile.mkdtemp()
        st.session_state.mv_temp_file_storage_dir = mv_temp_file_storage_dir
    else:
        mv_temp_file_storage_dir = st.session_state.mv_temp_file_storage_dir

    mv_processing_message = col2.empty()
    st.text("")
    st.text("")
    st.text("")
    st.text("")
    st.text("")
    st.text("")

    mv_vector_storage_dir = "/workspace/knowledge-base/01-ML/01-dev/adhoc/Talk2PDF/vector_store"

    if mv_pdf_input_file is not None:
        mv_file_name = mv_pdf_input_file.name
        # mv_vectorstore_file_name = os.path.join(mv_vector_storage_dir, mv_file_name[:-4] + ".vectorstore")
        # mv_metadata_file_name = os.path.join(mv_vector_storage_dir, mv_file_name[:-4] + ".metadata")
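        # -- Persistence sketch (not wired in): the index built below could be
        # -- saved under mv_vector_storage_dir and reloaded on later runs,
        # -- assuming the directory is writable; lv_embeddings here stands for
        # -- the same HuggingFaceEmbeddings object used to build the index, and
        # -- newer langchain versions also require
        # -- allow_dangerous_deserialization=True on load_local:
        # lv_vector_store.save_local(mv_vectorstore_file_name)
        # lv_vector_store = FAISS.load_local(mv_vectorstore_file_name, lv_embeddings)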
        if 'lv_vector_store' not in st.session_state:
            # -- Storing Uploaded PDF locally
            lv_temp_file_path = os.path.join(mv_temp_file_storage_dir, mv_file_name)
            with open(lv_temp_file_path, "wb") as lv_file:
                lv_file.write(mv_pdf_input_file.getbuffer())
            print("Step1: PDF uploaded successfully at -> " + lv_temp_file_path)
            mv_processing_message.text("Step1: PDF uploaded successfully at -> " + lv_temp_file_path)

            # -- Extracting PDF Text
            lv_pdf_content = fn_read_pdf(lv_temp_file_path, mv_processing_message)

            # -- Creating FAISS Vector Store
            lv_vector_store = fn_create_faiss_vector_store(lv_pdf_content, mv_processing_message)
            st.session_state.lv_vector_store = lv_vector_store
        else:
            lv_vector_store = st.session_state.lv_vector_store

        # -- Taking input question and generating answer
        col1, col2, col3 = st.columns(3)
        lv_chat_message = col2.chat_message  # alias used below to render chat bubbles

        if mv_user_question := col2.chat_input("Chat on PDF Data"):
            # -- Add user message to chat history
            st.session_state.messages.append({"role": "user", "content": mv_user_question})

            # -- Generating LLM response
            lv_response = fn_generate_QnA_response(mv_selected_model, mv_user_question, lv_vector_store, mv_processing_message)

            # -- Adding assistant response to chat history
            st.session_state.messages.append({"role": "assistant", "content": lv_response})

        # -- Display chat messages from history on app rerun
        for message in st.session_state.messages:
            with lv_chat_message(message["role"]):
                st.markdown(message["content"])
# Calling Main Function
if __name__ == '__main__':
    main()