ChaitanyaFM commited on
Commit
060c9d8
·
1 Parent(s): 3a90b59

Created index file to store the indices

Browse files
__pycache__/htmlTemplates.cpython-311.pyc CHANGED
Binary files a/__pycache__/htmlTemplates.cpython-311.pyc and b/__pycache__/htmlTemplates.cpython-311.pyc differ
 
app.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from dotenv import load_dotenv
3
+ from PyPDF2 import PdfReader
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ from langchain.embeddings import HuggingFaceInstructEmbeddings
6
+ from langchain.vectorstores import FAISS
7
+ from langchain.memory import ConversationBufferMemory
8
+ from langchain.chains import ConversationalRetrievalChain
9
+ from htmlTemplates import css, bot_template, user_template
10
+ from langchain.llms import HuggingFaceHub
11
+ import os
12
+ import numpy as np
13
+
14
+ # EMBEDDINGS_FILE = "embeddings.npy"
15
+ INDEX_FILE = "index.faiss"
16
+
17
def save_embeddings_and_index(index):
    """Persist the FAISS vector store to disk at INDEX_FILE.

    Args:
        index: A langchain FAISS vector store to serialize.
    """
    # save_local writes the index and its docstore under INDEX_FILE so a
    # later session can restore it via load_embeddings_and_index().
    index.save_local(INDEX_FILE)
20
+
21
def load_embeddings_and_index():
    """Restore the FAISS index saved at INDEX_FILE, or return None if absent.

    NOTE(review): the embedding model here must match the one used when the
    index was originally built — confirm if the model name ever changes.
    """
    if not os.path.exists(INDEX_FILE):
        return None
    embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    return FAISS.load_local(INDEX_FILE, embeddings)
27
+
28
+
29
def get_pdf_text(pdf):
    """Extract and concatenate the text of every page in a PDF.

    Args:
        pdf: A file-like object (or path) accepted by PyPDF2's PdfReader.

    Returns:
        The concatenated page text; pages with no extractable text
        contribute nothing.
    """
    pdf_reader = PdfReader(pdf)
    # extract_text() can return None for image-only/scanned pages; the
    # original `text += page.extract_text()` would raise TypeError there.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
35
+
36
+
37
def get_files(text_doc):
    """Concatenate the textual content of a list of uploaded files.

    Plain-text uploads are decoded as UTF-8; PDFs are delegated to
    get_pdf_text. Files with any other MIME type are skipped silently.
    """
    parts = []
    for uploaded in text_doc:
        if uploaded.type == "text/plain":
            # Raw bytes of a text upload decode directly to the content.
            parts.append(uploaded.getvalue().decode("utf-8"))
        elif uploaded.type == "application/pdf":
            parts.append(get_pdf_text(uploaded))
    return "".join(parts)
46
+
47
+
48
def get_text_chunks(text):
    """Split raw document text into chunks suitable for embedding.

    Args:
        text: The full extracted document text.

    Returns:
        A list of chunk strings, each at most ~900 characters, with no
        overlap between consecutive chunks.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=900,
        chunk_overlap=0,
        # `separators` expects a list of separator strings; passing the
        # bare string "\n" is treated as an iterable of characters.
        separators=["\n"],
        add_start_index=True,
        length_function=len,
    )
    return text_splitter.split_text(text)
58
+
59
+
60
def get_vectorstore(text_chunks, index):
    """Add chunks to an existing FAISS store, or build a fresh one.

    Args:
        text_chunks: List of strings to embed and index.
        index: An existing FAISS store to append to, or None.

    Returns:
        The updated (or newly created) FAISS vector store.
    """
    if index is not None:
        # Append to the store restored from disk; it embeds new texts with
        # the embedding function it was constructed with.
        index.add_texts(texts=text_chunks)
        return index
    embedder = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    return FAISS.from_texts(texts=text_chunks, embedding=embedder)
68
+
69
+
70
+
71
def get_conversation_chain(vectorstore):
    """Build a conversational retrieval chain over the given vector store.

    Uses the HuggingFace-hosted flan-t5-xxl model as the LLM and keeps the
    running dialogue in a buffer memory under the 'chat_history' key.
    """
    llm = HuggingFaceHub(
        repo_id="google/flan-t5-xxl",
        model_kwargs={"temperature": 0.2, "max_length": 1024},
    )
    memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory,
    )
82
+
83
+
84
def handle_userinput(user_question):
    """Run the question through the conversation chain and render the chat.

    The chain's 'chat_history' alternates turns: even indices are user
    messages, odd indices are bot messages.
    """
    response = st.session_state.conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']

    for position, message in enumerate(st.session_state.chat_history):
        template = user_template if position % 2 == 0 else bot_template
        st.write(
            template.replace("{{MSG}}", message.content),
            unsafe_allow_html=True,
        )
95
+
96
+
97
def main():
    """Streamlit entry point: chat UI plus a sidebar for indexing documents."""
    load_dotenv()
    st.set_page_config(page_title="ChatBot")
    st.write(css, unsafe_allow_html=True)

    if "conversation" not in st.session_state:
        index = load_embeddings_and_index()
        # On a fresh start there is no saved index yet; the original code
        # called get_conversation_chain(None), which crashed on
        # None.as_retriever(). Only build a chain when an index exists.
        st.session_state.conversation = (
            get_conversation_chain(index) if index is not None else None
        )
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Chat Bot")
    user_question = st.text_input("Ask a question:")
    if user_question:
        if st.session_state.conversation is None:
            st.warning("Please upload and process documents first.")
        else:
            handle_userinput(user_question)

    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
        if st.button("Process"):
            with st.spinner("Processing"):
                index = load_embeddings_and_index()
                raw_text = get_files(pdf_docs)
                text_chunks = get_text_chunks(raw_text)
                # Load a new faiss index or append to existing (if it exists)
                index = get_vectorstore(text_chunks, index)
                # save updated faiss index
                save_embeddings_and_index(index)

                # create conversation chain over the updated index
                st.session_state.conversation = get_conversation_chain(index)


if __name__ == '__main__':
    main()
database_app.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import streamlit as st
2
+ # from dotenv import load_dotenv
3
+ # from PyPDF2 import PdfReader
4
+ # from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ # from langchain.embeddings import HuggingFaceInstructEmbeddings
6
+ # from langchain.vectorstores import FAISS
7
+ # from langchain.memory import ConversationBufferMemory
8
+ # from langchain.chains import ConversationalRetrievalChain
9
+ # from htmlTemplates import css, bot_template, user_template
10
+ # from langchain.llms import HuggingFaceHub
11
+ # import psycopg2
12
+ # from pgvector import PGVector
13
+
14
+
15
+ # # Database connection parameters
16
+ # DB_HOST = "localhost"
17
+ # DB_PORT = "5432"
18
+ # DB_NAME = "chatbot"
19
+ # DB_USER = "admin"
20
+ # DB_PASSWORD = "admin"
21
+
22
+
23
+ # #Function to establish a database connection
24
+ # def connect_to_postgresql():
25
+ # return psycopg2.connect(
26
+ # host=DB_HOST,
27
+ # port=DB_PORT,
28
+ # database=DB_NAME,
29
+ # user=DB_USER,
30
+ # password=DB_PASSWORD
31
+ # )
32
+
33
+
34
+ # def store_embeddings_in_postgresql(text_chunks, conn):
35
+ # """Function to store embeddings in PostgreSQL using pgvector"""
36
+
37
+ # # Create a cursor
38
+ # cursor = conn.cursor()
39
+
40
+ # try:
41
+ # # Create a table if not exists
42
+ # cursor.execute("""
43
+ # CREATE TABLE IF NOT EXISTS embeddings (
44
+ # id SERIAL PRIMARY KEY,
45
+ # vector PG_VECTOR
46
+ # )
47
+ # """)
48
+
49
+ # # Insert embeddings into the table
50
+ # for text_chunk in text_chunks:
51
+ # # To store embeddings in a 'vector' column in 'embeddings' table
52
+ # cursor.execute("INSERT INTO embeddings (vector) VALUES (PG_VECTOR(%s))", (text_chunk,))
53
+
54
+ # # Commit the transaction
55
+ # conn.commit()
56
+ # st.success("Embeddings stored successfully in PostgreSQL.")
57
+ # except Exception as e:
58
+ # # Rollback in case of an error
59
+ # conn.rollback()
60
+ # st.error(f"Error storing embeddings in PostgreSQL: {str(e)}")
61
+ # finally:
62
+ # # Close the cursor
63
+ # cursor.close()
64
+
65
+
66
+ # def create_index_in_postgresql(conn):
67
+ # """Function to create an index on the stored vectors using HNSW or IVFFIT"""
68
+
69
+ # # Create a cursor
70
+ # cursor = conn.cursor()
71
+
72
+ # try:
73
+ # # Create an index if not exists
74
+ # cursor.execute("""
75
+ # CREATE INDEX IF NOT EXISTS embeddings_index
76
+ # ON embeddings
77
+ # USING ivfflat (vector)
78
+ # """)
79
+
80
+ # # Commit the transaction
81
+ # conn.commit()
82
+ # st.success("Index created successfully in PostgreSQL.")
83
+ # except Exception as e:
84
+ # # Rollback in case of an error
85
+ # conn.rollback()
86
+ # st.error(f"Error creating index in PostgreSQL: {str(e)}")
87
+ # finally:
88
+ # # Close the cursor
89
+ # cursor.close()
90
+
91
+
92
+ # def get_pdf_text(pdf):
93
+ # """Upload pdf files and extract text"""
94
+ # text = ""
95
+ # pdf_reader = PdfReader(pdf)
96
+ # for page in pdf_reader.pages:
97
+ # text += page.extract_text()
98
+ # return text
99
+
100
+
101
+ # def get_files(text_doc):
102
+ # """Upload text files and extract text"""
103
+ # text =""
104
+ # for file in text_doc:
105
+ # print(text)
106
+ # if file.type == "text/plain":
107
+ # # Read the text directly from the file
108
+ # text += file.getvalue().decode("utf-8")
109
+ # elif file.type == "application/pdf":
110
+ # text += get_pdf_text(file)
111
+ # return text
112
+
113
+
114
+ # def get_text_chunks(text):
115
+ # """Create chunks of the extracted text"""
116
+ # text_splitter = RecursiveCharacterTextSplitter(
117
+ # chunk_size=900,
118
+ # chunk_overlap=0,
119
+ # separators="\n",
120
+ # add_start_index = True,
121
+ # length_function= len
122
+ # )
123
+ # chunks = text_splitter.split_text(text)
124
+ # return chunks
125
+
126
+
127
+ # def get_vectorstore(text_chunks, conn):
128
+ # """Create embeddings for the chunks and store them in a vectorstore"""
129
+ # embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
130
+ # vectorstore = PGVector.from_texts(texts=text_chunks, embedding=embeddings, connection=conn)
131
+ # return vectorstore
132
+
133
+
134
+ # def get_conversation_chain(vectorstore):
135
+ # llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.2, "max_length":1024})
136
+
137
+ # memory = ConversationBufferMemory(
138
+ # memory_key='chat_history', return_messages=True)
139
+ # conversation_chain = ConversationalRetrievalChain.from_llm(
140
+ # llm=llm,
141
+ # retriever=vectorstore.as_retriever(),
142
+ # memory=memory
143
+ # )
144
+ # return conversation_chain
145
+
146
+
147
+ # def handle_userinput(user_question):
148
+ # response = st.session_state.conversation({'question': user_question})
149
+ # st.session_state.chat_history = response['chat_history']
150
+
151
+ # for i, message in enumerate(st.session_state.chat_history):
152
+ # if i % 2 == 0:
153
+ # st.write(user_template.replace(
154
+ # "{{MSG}}", message.content), unsafe_allow_html=True)
155
+ # else:
156
+ # st.write(bot_template.replace(
157
+ # "{{MSG}}", message.content), unsafe_allow_html=True)
158
+
159
+
160
+ # def main():
161
+ # load_dotenv()
162
+ # st.set_page_config(page_title="ChatBot")
163
+ # st.write(css, unsafe_allow_html=True)
164
+
165
+ # if "conversation" not in st.session_state:
166
+ # st.session_state.conversation = None
167
+ # if "chat_history" not in st.session_state:
168
+ # st.session_state.chat_history = None
169
+
170
+ # # Connect to PostgreSQL
171
+ # conn = connect_to_postgresql()
172
+
173
+ # st.header("Chat Bot")
174
+ # user_question = st.text_input("Ask a question:")
175
+ # if user_question:
176
+ # handle_userinput(user_question, conn)
177
+
178
+ # with st.sidebar:
179
+ # st.subheader("Your documents")
180
+ # pdf_docs = st.file_uploader(
181
+ # "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
182
+ # if st.button("Process"):
183
+ # with st.spinner("Processing"):
184
+ # # get text
185
+ # raw_text = get_files(pdf_docs)
186
+
187
+ # # get the text chunks
188
+ # text_chunks = get_text_chunks(raw_text)
189
+
190
+ # # store embeddings in PostgreSQL
191
+ # store_embeddings_in_postgresql(text_chunks, conn)
192
+
193
+ # # create vector store
194
+ # vectorstore = get_vectorstore(text_chunks, conn)
195
+
196
+ # # create index in PostgreSQL
197
+ # create_index_in_postgresql(conn)
198
+
199
+ # # create conversation chain
200
+ # st.session_state.conversation = get_conversation_chain(
201
+ # vectorstore)
202
+
203
+ # if __name__ == '__main__':
204
+ # main()
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ