# Captured from a Hugging Face Space file view (Space status at capture time: "Runtime error").
# File size: 4,366 bytes
# Revision hashes shown in the page gutter: 060c9d8 fee98e6 156c3a1 c729c7f 1ab4f11
import html
import os

import numpy as np
import streamlit as st
from dotenv import load_dotenv
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from PyPDF2 import PdfReader

from htmlTemplates import css, bot_template, user_template
# Disabled setting: apparently a path once intended for a separate NumPy dump of the embeddings.
#EMBEDDINGS_FILE = "embeddings.npy"
# Path handed to the FAISS store's save_local()/load_local() to persist the index between runs.
# NOTE(review): LangChain's FAISS.save_local appears to treat this as a folder path rather than
# a single file, despite the ".faiss" suffix -- confirm against the installed version.
INDEX_FILE = "index.faiss"
def save_embeddings_and_index(index):
    """Persist the vector store to disk.

    Args:
        index: Vector store object exposing ``save_local``; it is written
            to the location named by ``INDEX_FILE``.
    """
    target_path = INDEX_FILE
    index.save_local(target_path)
def load_embeddings_and_index():
    """Load the previously saved FAISS store, if one exists.

    Returns:
        The store loaded from ``INDEX_FILE`` using the
        ``hkunlp/instructor-xl`` embedding model, or ``None`` when nothing
        exists at that path yet.
    """
    # Guard clause: nothing has been persisted, so there is nothing to load.
    if not os.path.exists(INDEX_FILE):
        return None

    embedder = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    return FAISS.load_local(INDEX_FILE, embedder)
def get_pdf_text(pdf):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf: The PDF source handed straight to ``PdfReader``.

    Returns:
        One string made of each page's ``extract_text()`` result, with no
        separator inserted between pages.
    """
    reader = PdfReader(pdf)
    return "".join(page.extract_text() for page in reader.pages)
def get_files(text_doc):
    """Extract and concatenate the text of the given uploaded files.

    Args:
        text_doc: Iterable of file objects exposing a ``type`` attribute
            (MIME type) and, for plain-text files, ``getvalue()`` returning
            bytes.

    Returns:
        The text of all ``text/plain`` files (decoded as UTF-8) and
        ``application/pdf`` files (via ``get_pdf_text``), joined in input
        order with no separator. Files of any other type are skipped.
    """
    pieces = []
    for upload in text_doc:
        mime_type = upload.type
        if mime_type == "text/plain":
            # Plain text: the raw bytes are the content, just decode them.
            pieces.append(upload.getvalue().decode("utf-8"))
        elif mime_type == "application/pdf":
            pieces.append(get_pdf_text(upload))
    return "".join(pieces)
def get_text_chunks(text):
    """Split raw text into chunks for embedding.

    Args:
        text: The full text to split.

    Returns:
        The list of chunk strings produced by ``split_text``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=900,
        chunk_overlap=0,
        # The splitter expects a *list* of separators. A bare "\n" string only
        # worked because iterating a one-character string yields that same
        # character; pass the list explicitly.
        separators=["\n"],
        # NOTE(review): start indices are presumably only attached when the
        # splitter builds documents, not by split_text() -- confirm whether
        # this flag is still needed here.
        add_start_index=True,
        length_function=len,
    )
    return text_splitter.split_text(text)
def get_vectorstore(text_chunks, index):
    """Create a FAISS store from the chunks, or add them to an existing one.

    Args:
        text_chunks: List of text chunks to embed and index.
        index: An existing vector store to extend, or ``None`` to build a
            new one with the ``hkunlp/instructor-xl`` embedding model.

    Returns:
        The new store when ``index`` is ``None``; otherwise the same
        ``index`` object after the chunks have been added to it.

    Raises:
        ValueError: If there are no chunks and no existing index, since a
            store cannot be built from nothing.
    """
    if not text_chunks:
        if index is None:
            # Fail early with a clear message instead of an opaque error from
            # deep inside the embedding / FAISS construction code.
            raise ValueError(
                "Cannot build a vector store: no text chunks were provided."
            )
        # Nothing new to add; leave the existing index untouched.
        return index

    if index is None:
        embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
        return FAISS.from_texts(texts=text_chunks, embedding=embeddings)

    index.add_texts(texts=text_chunks)
    return index
def get_conversation_chain(vectorstore):
    """Build a conversational retrieval chain on top of a vector store.

    Args:
        vectorstore: Store exposing ``as_retriever()``; used as the chain's
            retriever.

    Returns:
        A ``ConversationalRetrievalChain`` combining a Hugging Face Hub LLM
        (``mistralai/Mistral-7B-v0.1``), the store's retriever, and a buffer
        memory kept under the ``chat_history`` key.
    """
    hub_llm = HuggingFaceHub(
        repo_id="mistralai/Mistral-7B-v0.1",
        model_kwargs={"temperature":0.2, "max_length":1024},
    )
    history = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
    return ConversationalRetrievalChain.from_llm(
        llm=hub_llm,
        retriever=vectorstore.as_retriever(),
        memory=history,
    )
def handle_userinput(user_question):
    """Send the user's question to the conversation chain and render the chat.

    Args:
        user_question: The question text entered by the user.

    Side effects:
        Stores the chain's ``chat_history`` in ``st.session_state`` and writes
        every message to the page, alternating user and bot templates. If no
        conversation chain exists yet, shows a warning and returns instead.
    """
    conversation = st.session_state.conversation
    if conversation is None:
        # No documents have been processed yet, so there is no chain to call;
        # calling None would raise TypeError.
        st.warning(
            "Please upload and process at least one document before asking a question."
        )
        return

    response = conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']

    for i, message in enumerate(st.session_state.chat_history):
        # Even positions are rendered as the user's turns, odd ones as the bot's.
        template = user_template if i % 2 == 0 else bot_template
        # The templates are rendered as raw HTML, so escape the message text to
        # keep user input and model output from injecting markup.
        st.write(
            template.replace("{{MSG}}", html.escape(message.content)),
            unsafe_allow_html=True,
        )
def _process_uploads(uploaded_files):
    """Index the uploaded files and rebuild the conversation chain."""
    if not uploaded_files:
        # Without this guard the pipeline would run on empty text and fail
        # while building the vector store.
        st.warning("Please upload at least one document before clicking 'Process'.")
        return

    with st.spinner("Processing"):
        raw_text = get_files(uploaded_files)
        text_chunks = get_text_chunks(raw_text)
        if not text_chunks:
            st.warning("No text could be extracted from the uploaded documents.")
            return

        # Load a new faiss index or append to existing (if it exists)
        index = get_vectorstore(text_chunks, load_embeddings_and_index())
        # Save the updated faiss index
        save_embeddings_and_index(index)
        # Create the conversation chain
        st.session_state.conversation = get_conversation_chain(index)


def main():
    """Run the Streamlit chat app.

    Loads environment variables, configures the page, initialises the
    ``conversation`` and ``chat_history`` session entries (reusing a saved
    index when one can be loaded), answers the question typed in the text
    box, and offers a sidebar for uploading and processing documents.
    """
    load_dotenv()
    st.set_page_config(page_title="ChatBot")
    st.write(css, unsafe_allow_html=True)

    if "conversation" not in st.session_state:
        index = load_embeddings_and_index()
        # Identity check against None rather than ``==``.
        st.session_state.conversation = (
            None if index is None else get_conversation_chain(index)
        )
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Chat Bot")
    user_question = st.text_input("Ask a question:")
    if user_question:
        handle_userinput(user_question)

    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
        if st.button("Process"):
            _process_uploads(pdf_docs)
# Script entry point: start the app only when this file is executed directly.
if __name__ == '__main__':
    main()
# (end of captured source)