Spaces:

DrishtiSharma
/

chat-w-docs-via-speech-or-text

Sleeping

App Files Files Community

DrishtiSharma committed on Mar 31, 2024

Commit

fad1562

verified ·

1 Parent(s): 09d6673

Create app.py

Browse files

Files changed (1) hide show

app.py +133 -0

app.py ADDED Viewed

	@@ -0,0 +1,133 @@

+import streamlit as st
+from dotenv import load_dotenv
+from PyPDF2 import PdfReader
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings, HuggingFaceEmbeddings
+from langchain.vectorstores import FAISS
+from langchain.chat_models import ChatOpenAI
+from langchain.memory import ConversationBufferMemory
+from langchain.chains import ConversationalRetrievalChain
+from langchain.chat_models.gigachat import GigaChat
+from htmlTemplates import css, bot_template, user_template
+from langchain.llms import HuggingFaceHub, LlamaCpp
+from huggingface_hub import snapshot_download, hf_hub_download
+# Repository id of a GGUF model; in the visible code it is referenced only by the
+# commented-out snapshot_download call below.
+repo_name = "IlyaGusev/saiga_mistral_7b_gguf"
+# File name inside that repository. It is also passed to get_conversation_chain()
+# at the bottom of the script.
+model_name = "model-q4_K.gguf"
+#snapshot_download(repo_id=repo_name, local_dir=".", allow_patterns=model_name)
+def get_pdf_text(pdf_docs):
+    text = ""
+    for pdf in pdf_docs:
+        pdf_reader = PdfReader(pdf)
+        for page in pdf_reader.pages:
+            text += page.extract_text()
+    return text
+def get_text_chunks(text):
+    text_splitter = CharacterTextSplitter(separator="\n",
+                                          chunk_size=1000,  # 1000
+                                          chunk_overlap=200,  # 200
+                                          length_function=len
+                                          )
+    chunks = text_splitter.split_text(text)
+    return chunks
+#def get_vectorstore(text_chunks):
+    #embeddings = OpenAIEmbeddings()
+    #embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
+    #embeddings = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large")
+    #embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
+    #vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
+    #return vectorstore
+def get_vectorstore(text_chunks, embedding_model_name="intfloat/multilingual-e5-large"):
+    embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
+    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
+    return vectorstore
+def get_conversation_chain(vectorstore, model_name):
+    llm = GigaChat(profanity=False,
+                   verify_ssl_certs=False
+                  )
+    memory = ConversationBufferMemory(memory_key='chat_history',
+                                      input_key='question',
+                                      output_key='answer',
+                                      return_messages=True)
+    conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm,
+                                                               retriever=vectorstore.as_retriever(),
+                                                               memory=memory,
+                                                               return_source_documents=True
+                                                               )
+    return conversation_chain
+def handle_userinput(user_question):
+    response = st.session_state.conversation({'question': user_question})
+    st.session_state.chat_history = response['chat_history']
+    st.session_state.retrieved_text = response['source_documents']
+    for i, (message, text) in enumerate(zip(st.session_state.chat_history, st.session_state.retrieved_text)):
+        if i % 3 == 0:
+            st.write(user_template.replace(
+                "{{MSG}}", message.content), unsafe_allow_html=True)
+        else:
+            st.write(bot_template.replace(
+                "{{MSG}}", message.content), unsafe_allow_html=True)
+            print(text)
+            st.write(bot_template.replace(
+                "{{MSG}}", str(text.page_content)), unsafe_allow_html=True)
+st.set_page_config(page_title="Chat with multiple PDFs",
+                   page_icon=":books:")
+st.write(css, unsafe_allow_html=True)
+if "conversation" not in st.session_state:
+    st.session_state.conversation = None
+if "chat_history" not in st.session_state:
+    st.session_state.chat_history = None
+st.header("Chat with multiple PDFs :books:")
+user_question = st.text_input("Ask a question about your documents: ")
+if user_question:
+    handle_userinput(user_question)
+with st.sidebar:
+    st.subheader("Your documents")
+    embedding_model_name = st.selectbox("Select embedding model", ["intfloat/multilingual-e5-large", "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"])
+    pdf_docs = st.file_uploader(
+        "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
+    if st.button("Process"):
+        with st.spinner("Processing"):
+            # get pdf text
+            raw_text = get_pdf_text(pdf_docs)
+            # get the text chunks
+            text_chunks = get_text_chunks(raw_text)
+            # create vector store
+            vectorstore = get_vectorstore(text_chunks, embedding_model_name)
+            # create conversation chain
+            st.session_state.conversation = get_conversation_chain(vectorstore, model_name)