qorgh346 committed on
Commit
47313bf
·
1 Parent(s): b67e798
Files changed (4) hide show
  1. .gitattributes +1 -0
  2. app.py +200 -0
  3. htmlTemplates.py +44 -0
  4. requirements.txt +12 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.gguf filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from dotenv import load_dotenv
3
+ from PyPDF2 import PdfReader
4
+ from langchain.text_splitter import CharacterTextSplitter,RecursiveCharacterTextSplitter
5
+ from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
6
+ from langchain.vectorstores import FAISS, Chroma
7
+ from langchain.embeddings import HuggingFaceEmbeddings # General embeddings from HuggingFace models.
8
+ from langchain.chat_models import ChatOpenAI
9
+ from langchain.memory import ConversationBufferMemory
10
+ from langchain.chains import ConversationalRetrievalChain
11
+ from htmlTemplates import css, bot_template, user_template
12
+ from langchain.llms import HuggingFaceHub, LlamaCpp,CTransformers # For loading transformer models.
13
+ from langchain.document_loaders import PyPDFLoader
14
+ from tempfile import NamedTemporaryFile
15
def get_pdf_text(pdf_docs):
    """Load an uploaded PDF and return its pages as langchain documents.

    Args:
        pdf_docs: an uploaded file object exposing ``getvalue()`` (e.g. a
            Streamlit ``UploadedFile``) whose bytes are a PDF.

    Returns:
        The list of per-page ``Document`` objects from ``PyPDFLoader.load()``.
    """
    # PyPDFLoader only accepts a filesystem path, so spill the uploaded
    # bytes into a temporary file first.
    # NOTE(review): NamedTemporaryFile cannot be reopened by name on Windows
    # while still open — fine on POSIX; confirm the deployment platform.
    with NamedTemporaryFile() as temp_file:
        temp_file.write(pdf_docs.getvalue())
        temp_file.seek(0)
        pdf_loader = PyPDFLoader(temp_file.name)
        return pdf_loader.load()
33
+
34
+
35
def get_text_chunks(documents):
    """Split loaded documents into overlapping chunks for embedding.

    Args:
        documents: list of langchain ``Document`` objects.

    Returns:
        A new list of ``Document`` chunks of at most 1000 characters each,
        with 200 characters of overlap between consecutive chunks.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    # Removed: debug print of every chunk on each call, and the dead
    # commented-out CharacterTextSplitter alternative.
    return text_splitter.split_documents(documents)
51
+
52
+
53
def get_vectorstore(text_chunks):
    """Embed document chunks and index them in a FAISS vector store.

    Args:
        text_chunks: list of langchain ``Document`` chunks.

    Returns:
        A ``FAISS`` vector store built over the chunk embeddings.
    """
    # Small CPU-friendly sentence-transformers model.
    embeddings = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L12-v2',
        model_kwargs={'device': 'cpu'},
    )
    # BUG FIX: FAISS.from_documents takes ``documents=``, not ``texts=``
    # (``from_texts`` is the variant that accepts raw strings); the old
    # keyword raised TypeError at runtime.
    return FAISS.from_documents(documents=text_chunks, embedding=embeddings)
65
+
66
+
67
def get_conversation_chain(vectorstore):
    """Build a conversational retrieval chain over the given vector store.

    Args:
        vectorstore: a langchain vector store supporting ``as_retriever()``.

    Returns:
        A ``ConversationalRetrievalChain`` wired to a local llama.cpp model
        with buffered chat history.
    """
    model_path = 'llama-2-7b-chat.Q2_K.gguf'
    # BUG FIX: LlamaCpp has no ``input=`` constructor argument, so the
    # sampling settings passed that way were silently ignored. Pass them as
    # real keyword arguments (``max_length`` maps to ``max_tokens``).
    # Also removed the unused ``config`` dict left over from CTransformers.
    llm = LlamaCpp(
        model_path=model_path,
        temperature=0.75,
        top_p=1,
        max_tokens=2000,
        verbose=True,
    )
    memory = ConversationBufferMemory(
        memory_key='chat_history', return_messages=True)
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory,
    )
    return conversation_chain
88
+
89
+
90
def handle_userinput(user_question):
    """Run the question through the conversation chain and render the chat.

    Stores the updated history in ``st.session_state.chat_history`` and
    renders it; even-indexed turns are the user, odd-indexed the bot.
    """
    result = st.session_state.conversation({'question': user_question})
    st.session_state.chat_history = result['chat_history']

    for turn, msg in enumerate(st.session_state.chat_history):
        template = user_template if turn % 2 == 0 else bot_template
        st.write(template.replace("{{MSG}}", msg.content),
                 unsafe_allow_html=True)
101
+
102
def get_text_file(docs):
    """Return the contents of an uploaded plain-text file as a UTF-8 string."""
    raw_bytes = docs.read()
    return raw_bytes.decode("utf-8")
105
+
106
def get_csv_file(docs):
    """Flatten an uploaded CSV file into one line of text per row.

    The first column's value starts the line; every remaining column is
    appended as ``"<column> is <value> "``.

    Args:
        docs: a file-like object readable by ``pandas.read_csv``.

    Returns:
        A single string with one newline-terminated line per CSV row.
    """
    import pandas as pd
    text = ''

    data = pd.read_csv(docs)

    for index, row in data.iterrows():
        # BUG FIX: ``row[0]`` is a label-based Series lookup (only a
        # deprecated positional fallback makes it work); use ``iloc`` for
        # an explicit positional access to the first column.
        row_text = row.iloc[0]
        for col_name in data.columns[1:]:
            row_text += '{} is {} '.format(col_name, row[col_name])
        text += row_text + '\n'

    return text
120
+
121
def get_json_file(docs):
    """Flatten an uploaded JSON object into text.

    Each top-level key is concatenated with every element of its iterable
    value ("<key><element>"), and one newline is emitted per top-level key.

    Args:
        docs: a file-like object containing a JSON object.

    Returns:
        The flattened text (empty string for an empty object).
    """
    import json

    parsed = json.load(docs)
    pieces = []
    for key, value in parsed.items():
        for element in value:
            pieces.append(str(key) + str(element))
        pieces.append('\n')
    return ''.join(pieces)
133
+
134
def get_hwp_file(docs):
    """Extract text from an uploaded HWP (Hangul word processor) file.

    Not implemented yet. BUG FIX: return an empty string instead of the old
    implicit ``None`` — the caller does ``raw_text += get_hwp_file(file)``,
    which raised TypeError whenever an .hwp file was uploaded.
    """
    return ''
136
+
137
def get_docs_file(docs):
    """Extract text from an uploaded .docx file.

    Not implemented yet. BUG FIX: return an empty string instead of the old
    implicit ``None`` — the caller does ``raw_text += get_docs_file(file)``,
    which raised TypeError whenever a .docx file was uploaded.
    """
    return ''
139
+
140
+
141
def main():
    """Streamlit entry point: upload documents, index them, and chat over them."""
    load_dotenv()
    st.set_page_config(page_title="Chat with multiple PDFs",
                       page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Chat with multiple PDFs :books:")
    user_question = st.text_input("Ask a question about your documents:")
    if user_question:
        handle_userinput(user_question)

    with st.sidebar:
        st.subheader("Your documents")
        docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
        if st.button("Process"):
            with st.spinner("Processing"):
                doc_list = []
                # BUG FIX: raw_text was never initialized, so the first
                # non-PDF upload raised NameError on ``raw_text +=``.
                raw_text = ''

                for file in docs:
                    print('file - type : ', file.type)
                    if file.type == 'text/plain':
                        # file is .txt
                        raw_text += get_text_file(file)
                    elif file.type in ['application/octet-stream', 'application/pdf']:
                        # file is .pdf — BUG FIX: use extend, not append.
                        # get_pdf_text returns a *list* of Documents and a
                        # nested list breaks split_documents() downstream.
                        doc_list.extend(get_pdf_text(file))
                    elif file.type == 'text/csv':
                        # file is .csv
                        raw_text += get_csv_file(file)
                    elif file.type == 'application/json':
                        # file is .json
                        raw_text += get_json_file(file)
                    elif file.type == 'application/x-hwp':
                        # file is .hwp
                        raw_text += get_hwp_file(file)
                    elif file.type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
                        # file is .docx
                        raw_text += get_docs_file(file)

                # BUG FIX: text collected from non-PDF files was previously
                # discarded; wrap it in a Document so it reaches the index.
                if raw_text:
                    from langchain.docstore.document import Document
                    doc_list.append(Document(page_content=raw_text))

                # get the text chunks
                text_chunks = get_text_chunks(doc_list)

                # create vector store
                vectorstore = get_vectorstore(text_chunks)

                # create conversation chain
                st.session_state.conversation = get_conversation_chain(
                    vectorstore)


if __name__ == '__main__':
    main()
htmlTemplates.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Inline CSS for the Streamlit chat UI: one flex "card" per message, with
# user/bot variants distinguished only by background colour. Injected via
# st.write(css, unsafe_allow_html=True).
css = '''
<style>
.chat-message {
padding: 1.5rem; border-radius: 0.5rem; margin-bottom: 1rem; display: flex
}
.chat-message.user {
background-color: #2b313e
}
.chat-message.bot {
background-color: #475063
}
.chat-message .avatar {
width: 20%;
}
.chat-message .avatar img {
max-width: 78px;
max-height: 78px;
border-radius: 50%;
object-fit: cover;
}
.chat-message .message {
width: 80%;
padding: 0 1.5rem;
color: #fff;
}
'''

# HTML snippet for an assistant (bot) message. The literal "{{MSG}}"
# placeholder is substituted with str.replace() by the app before rendering.
bot_template = '''
<div class="chat-message bot">
<div class="avatar">
<img src="https://i.ibb.co/cN0nmSj/Screenshot-2023-05-28-at-02-37-21.png" style="max-height: 78px; max-width: 78px; border-radius: 50%; object-fit: cover;">
</div>
<div class="message">{{MSG}}</div>
</div>
'''

# HTML snippet for a user message; same "{{MSG}}" substitution scheme.
user_template = '''
<div class="chat-message user">
<div class="avatar">
<img src="https://i.ibb.co/rdZC7LZ/Photo-logo-1.png">
</div>
<div class="message">{{MSG}}</div>
</div>
'''
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ llama-cpp-python
3
+ PyPDF2==3.0.1
4
+ faiss-cpu==1.7.4
5
+ ctransformers
6
+ pypdf
7
+ chromadb
8
+ tiktoken
9
+ pysqlite3-binary
10
+ streamlit-extras
11
+ InstructorEmbedding
12
+ sentence-transformers