DrishtiSharma commited on
Commit
fad1562
Β·
verified Β·
1 Parent(s): 09d6673

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +133 -0
app.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from dotenv import load_dotenv
3
+ from PyPDF2 import PdfReader
4
+ from langchain.text_splitter import CharacterTextSplitter
5
+ from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings, HuggingFaceEmbeddings
6
+ from langchain.vectorstores import FAISS
7
+ from langchain.chat_models import ChatOpenAI
8
+ from langchain.memory import ConversationBufferMemory
9
+ from langchain.chains import ConversationalRetrievalChain
10
+ from langchain.chat_models.gigachat import GigaChat
11
+ from htmlTemplates import css, bot_template, user_template
12
+ from langchain.llms import HuggingFaceHub, LlamaCpp
13
+ from huggingface_hub import snapshot_download, hf_hub_download
14
+
15
+
16
+
17
# Saiga Mistral 7B in GGUF format; model_name selects the 4-bit quantised
# weight file inside the repo.  Uncomment the snapshot_download call to
# fetch it into the working directory (only needed for local LlamaCpp use).
repo_name = "IlyaGusev/saiga_mistral_7b_gguf"
model_name = "model-q4_K.gguf"

#snapshot_download(repo_id=repo_name, local_dir=".", allow_patterns=model_name)
21
+
22
+
23
def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of every uploaded PDF.

    Args:
        pdf_docs: iterable of file-like objects accepted by
            ``PyPDF2.PdfReader`` (e.g. Streamlit ``UploadedFile`` instances).

    Returns:
        A single string holding the text of all pages, in upload order.
    """
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() can return None for pages with no extractable
            # text (e.g. scanned images); guard so the += never raises
            # TypeError on such pages.
            text += page.extract_text() or ""

    return text
31
+
32
+
33
def get_text_chunks(text):
    """Split *text* into overlapping chunks suitable for embedding.

    Splits on newlines with a 1000-character chunk size and a
    200-character overlap between consecutive chunks.

    Args:
        text: the full document text to split.

    Returns:
        A list of chunk strings.
    """
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return splitter.split_text(text)
42
+
43
+
44
def get_vectorstore(text_chunks, embedding_model_name="intfloat/multilingual-e5-large"):
    """Build a FAISS vector store over *text_chunks*.

    Args:
        text_chunks: list of text chunks to embed.
        embedding_model_name: HuggingFace sentence-embedding model used to
            embed the chunks; defaults to a multilingual E5 model.

    Returns:
        A FAISS vector store ready to serve as a retriever.
    """
    # Dead commented-out earlier variant of this function (OpenAI /
    # Instructor embeddings) removed — this live implementation already
    # parameterises the embedding model.
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    return vectorstore
57
+
58
+
59
+
60
def get_conversation_chain(vectorstore, model_name):
    """Create a ConversationalRetrievalChain backed by GigaChat.

    NOTE(review): the *model_name* parameter is currently unused — the LLM
    is hard-wired to GigaChat below; the parameter is kept so existing
    callers keep working.

    Args:
        vectorstore: vector store whose retriever feeds the chain.
        model_name: unused (see NOTE above).

    Returns:
        A ConversationalRetrievalChain with buffer memory that also
        returns source documents.
    """

    # GigaChat credentials presumably come from the environment loaded by
    # dotenv at import time — TODO confirm.
    llm = GigaChat(profanity=False,
                   verify_ssl_certs=False
                   )

    # Explicit input/output keys are needed because the chain produces two
    # outputs ('answer' and 'source_documents') and memory must know which
    # one to store.
    memory = ConversationBufferMemory(memory_key='chat_history',
                                      input_key='question',
                                      output_key='answer',
                                      return_messages=True)

    # return_source_documents=True exposes the retrieved passages so the
    # UI can render them alongside the answer.
    conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm,
                                                               retriever=vectorstore.as_retriever(),
                                                               memory=memory,
                                                               return_source_documents=True
                                                               )

    return conversation_chain
79
+
80
+
81
def handle_userinput(user_question):
    """Run *user_question* through the conversation chain and render the
    chat history plus the retrieved source passages into the page.

    Expects ``st.session_state.conversation`` to hold an initialised
    ConversationalRetrievalChain.
    """

    response = st.session_state.conversation({'question': user_question})

    # chat_history is the full list of alternating Human/AI messages;
    # source_documents holds the passages retrieved for the latest answer.
    st.session_state.chat_history = response['chat_history']

    st.session_state.retrieved_text = response['source_documents']

    # NOTE(review): zip() truncates to the shorter of the two lists, and for
    # an alternating Human/AI history ``i % 2 == 0`` would mark user turns —
    # the modulo-3 test looks suspicious; confirm the intended rendering
    # before changing.
    for i, (message, text) in enumerate(zip(st.session_state.chat_history, st.session_state.retrieved_text)):
        if i % 3 == 0:
            st.write(user_template.replace(
                "{{MSG}}", message.content), unsafe_allow_html=True)
        else:
            st.write(bot_template.replace(
                "{{MSG}}", message.content), unsafe_allow_html=True)
            print(text)
            # Also show the retrieved passage that grounded this answer.
            st.write(bot_template.replace(
                "{{MSG}}", str(text.page_content)), unsafe_allow_html=True)
99
+
100
+
101
# --- Streamlit page setup ---------------------------------------------------
st.set_page_config(page_title="Chat with multiple PDFs",
                   page_icon=":books:")
st.write(css, unsafe_allow_html=True)

# Initialise session state on first run so later attribute accesses are safe.
if "conversation" not in st.session_state:
    st.session_state.conversation = None
if "chat_history" not in st.session_state:
    st.session_state.chat_history = None

st.header("Chat with multiple PDFs :books:")
user_question = st.text_input("Ask a question about your documents: ")

if user_question:
    # Guard: the chain only exists after documents have been processed.
    # Without this check, handle_userinput would call None and crash with
    # a TypeError on the very first question.
    if st.session_state.conversation is None:
        st.warning("Please upload and process your documents first.")
    else:
        handle_userinput(user_question)

with st.sidebar:
    st.subheader("Your documents")
    embedding_model_name = st.selectbox("Select embedding model", ["intfloat/multilingual-e5-large", "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"])
    pdf_docs = st.file_uploader(
        "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
    if st.button("Process"):
        with st.spinner("Processing"):
            # get pdf text
            raw_text = get_pdf_text(pdf_docs)

            # get the text chunks
            text_chunks = get_text_chunks(raw_text)

            # create vector store
            vectorstore = get_vectorstore(text_chunks, embedding_model_name)

            # create conversation chain (model_name is the module-level GGUF
            # file name; it is currently ignored by get_conversation_chain)
            st.session_state.conversation = get_conversation_chain(vectorstore, model_name)