ChaitanyaFM commited on
Commit
060c9d8
·
1 Parent(s): 3a90b59

Created index file to store the indices

Browse files
__pycache__/htmlTemplates.cpython-311.pyc CHANGED
Binary files a/__pycache__/htmlTemplates.cpython-311.pyc and b/__pycache__/htmlTemplates.cpython-311.pyc differ
 
app.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from dotenv import load_dotenv
3
+ from PyPDF2 import PdfReader
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ from langchain.embeddings import HuggingFaceInstructEmbeddings
6
+ from langchain.vectorstores import FAISS
7
+ from langchain.memory import ConversationBufferMemory
8
+ from langchain.chains import ConversationalRetrievalChain
9
+ from htmlTemplates import css, bot_template, user_template
10
+ from langchain.llms import HuggingFaceHub
11
+ import os
12
+ import numpy as np
13
+
14
+ # EMBEDDINGS_FILE = "embeddings.npy"
15
+ INDEX_FILE = "index.faiss"
16
+
17
def save_embeddings_and_index(index):
    """Persist the FAISS vector store to disk at INDEX_FILE.

    Args:
        index: A langchain FAISS vector store to serialize.
    """
    # save_local writes the index and its docstore under INDEX_FILE so a
    # later session can restore it via load_embeddings_and_index().
    index.save_local(INDEX_FILE)
20
+
21
def load_embeddings_and_index():
    """Restore the FAISS index saved at INDEX_FILE, or return None if absent.

    NOTE(review): the embedding model here must match the one used when the
    index was originally built — confirm if the model name ever changes.
    """
    if not os.path.exists(INDEX_FILE):
        return None
    embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    return FAISS.load_local(INDEX_FILE, embeddings)
27
+
28
+
29
def get_pdf_text(pdf):
    """Extract and concatenate the text of every page in a PDF.

    Args:
        pdf: A file-like object (or path) accepted by PyPDF2's PdfReader.

    Returns:
        The concatenated page text; pages with no extractable text
        contribute nothing.
    """
    pdf_reader = PdfReader(pdf)
    # extract_text() can return None for image-only/scanned pages; the
    # original `text += page.extract_text()` would raise TypeError there.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
35
+
36
+
37
def get_files(text_doc):
    """Concatenate the textual content of a list of uploaded files.

    Plain-text uploads are decoded as UTF-8; PDFs are delegated to
    get_pdf_text. Files with any other MIME type are skipped silently.
    """
    parts = []
    for uploaded in text_doc:
        if uploaded.type == "text/plain":
            # Raw bytes of a text upload decode directly to the content.
            parts.append(uploaded.getvalue().decode("utf-8"))
        elif uploaded.type == "application/pdf":
            parts.append(get_pdf_text(uploaded))
    return "".join(parts)
46
+
47
+
48
def get_text_chunks(text):
    """Split raw document text into chunks suitable for embedding.

    Args:
        text: The full extracted document text.

    Returns:
        A list of chunk strings, each at most ~900 characters, with no
        overlap between consecutive chunks.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=900,
        chunk_overlap=0,
        # `separators` expects a list of separator strings; passing the
        # bare string "\n" is treated as an iterable of characters.
        separators=["\n"],
        add_start_index=True,
        length_function=len,
    )
    return text_splitter.split_text(text)
58
+
59
+
60
def get_vectorstore(text_chunks, index):
    """Add chunks to an existing FAISS store, or build a fresh one.

    Args:
        text_chunks: List of strings to embed and index.
        index: An existing FAISS store to append to, or None.

    Returns:
        The updated (or newly created) FAISS vector store.
    """
    if index is not None:
        # Append to the store restored from disk; it embeds new texts with
        # the embedding function it was constructed with.
        index.add_texts(texts=text_chunks)
        return index
    embedder = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    return FAISS.from_texts(texts=text_chunks, embedding=embedder)
68
+
69
+
70
+
71
def get_conversation_chain(vectorstore):
    """Build a conversational retrieval chain over the given vector store.

    Uses the HuggingFace-hosted flan-t5-xxl model as the LLM and keeps the
    running dialogue in a buffer memory under the 'chat_history' key.
    """
    llm = HuggingFaceHub(
        repo_id="google/flan-t5-xxl",
        model_kwargs={"temperature": 0.2, "max_length": 1024},
    )
    memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory,
    )
82
+
83
+
84
def handle_userinput(user_question):
    """Run the question through the conversation chain and render the chat.

    The chain's 'chat_history' alternates turns: even indices are user
    messages, odd indices are bot messages.
    """
    response = st.session_state.conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']

    for position, message in enumerate(st.session_state.chat_history):
        template = user_template if position % 2 == 0 else bot_template
        st.write(
            template.replace("{{MSG}}", message.content),
            unsafe_allow_html=True,
        )
95
+
96
+
97
def main():
    """Streamlit entry point: chat UI plus a sidebar for indexing documents."""
    load_dotenv()
    st.set_page_config(page_title="ChatBot")
    st.write(css, unsafe_allow_html=True)

    if "conversation" not in st.session_state:
        index = load_embeddings_and_index()
        # On a fresh start there is no saved index yet; the original code
        # called get_conversation_chain(None), which crashed on
        # None.as_retriever(). Only build a chain when an index exists.
        st.session_state.conversation = (
            get_conversation_chain(index) if index is not None else None
        )
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Chat Bot")
    user_question = st.text_input("Ask a question:")
    if user_question:
        if st.session_state.conversation is None:
            st.warning("Please upload and process documents first.")
        else:
            handle_userinput(user_question)

    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
        if st.button("Process"):
            with st.spinner("Processing"):
                index = load_embeddings_and_index()
                raw_text = get_files(pdf_docs)
                text_chunks = get_text_chunks(raw_text)
                # Load a new faiss index or append to existing (if it exists)
                index = get_vectorstore(text_chunks, index)
                # save updated faiss index
                save_embeddings_and_index(index)

                # create conversation chain over the updated index
                st.session_state.conversation = get_conversation_chain(index)


if __name__ == '__main__':
    main()
database_app.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import streamlit as st
2
+ # from dotenv import load_dotenv
3
+ # from PyPDF2 import PdfReader
4
+ # from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ # from langchain.embeddings import HuggingFaceInstructEmbeddings
6
+ # from langchain.vectorstores import FAISS
7
+ # from langchain.memory import ConversationBufferMemory
8
+ # from langchain.chains import ConversationalRetrievalChain
9
+ # from htmlTemplates import css, bot_template, user_template
10
+ # from langchain.llms import HuggingFaceHub
11
+ # import psycopg2
12
+ # from pgvector import PGVector
13
+
14
+
15
+ # # Database connection parameters
16
+ # DB_HOST = "localhost"
17
+ # DB_PORT = "5432"
18
+ # DB_NAME = "chatbot"
19
+ # DB_USER = "admin"
20
+ # DB_PASSWORD = "admin"
21
+
22
+
23
+ # #Function to establish a database connection
24
+ # def connect_to_postgresql():
25
+ # return psycopg2.connect(
26
+ # host=DB_HOST,
27
+ # port=DB_PORT,
28
+ # database=DB_NAME,
29
+ # user=DB_USER,
30
+ # password=DB_PASSWORD
31
+ # )
32
+
33
+
34
+ # def store_embeddings_in_postgresql(text_chunks, conn):
35
+ # """Function to store embeddings in PostgreSQL using pgvector"""
36
+
37
+ # # Create a cursor
38
+ # cursor = conn.cursor()
39
+
40
+ # try:
41
+ # # Create a table if not exists
42
+ # cursor.execute("""
43
+ # CREATE TABLE IF NOT EXISTS embeddings (
44
+ # id SERIAL PRIMARY KEY,
45
+ # vector PG_VECTOR
46
+ # )
47
+ # """)
48
+
49
+ # # Insert embeddings into the table
50
+ # for text_chunk in text_chunks:
51
+ # # To store embeddings in a 'vector' column in 'embeddings' table
52
+ # cursor.execute("INSERT INTO embeddings (vector) VALUES (PG_VECTOR(%s))", (text_chunk,))
53
+
54
+ # # Commit the transaction
55
+ # conn.commit()
56
+ # st.success("Embeddings stored successfully in PostgreSQL.")
57
+ # except Exception as e:
58
+ # # Rollback in case of an error
59
+ # conn.rollback()
60
+ # st.error(f"Error storing embeddings in PostgreSQL: {str(e)}")
61
+ # finally:
62
+ # # Close the cursor
63
+ # cursor.close()
64
+
65
+
66
+ # def create_index_in_postgresql(conn):
67
+ # """Function to create an index on the stored vectors using HNSW or IVFFIT"""
68
+
69
+ # # Create a cursor
70
+ # cursor = conn.cursor()
71
+
72
+ # try:
73
+ # # Create an index if not exists
74
+ # cursor.execute("""
75
+ # CREATE INDEX IF NOT EXISTS embeddings_index
76
+ # ON embeddings
77
+ # USING ivfflat (vector)
78
+ # """)
79
+
80
+ # # Commit the transaction
81
+ # conn.commit()
82
+ # st.success("Index created successfully in PostgreSQL.")
83
+ # except Exception as e:
84
+ # # Rollback in case of an error
85
+ # conn.rollback()
86
+ # st.error(f"Error creating index in PostgreSQL: {str(e)}")
87
+ # finally:
88
+ # # Close the cursor
89
+ # cursor.close()
90
+
91
+
92
+ # def get_pdf_text(pdf):
93
+ # """Upload pdf files and extract text"""
94
+ # text = ""
95
+ # pdf_reader = PdfReader(pdf)
96
+ # for page in pdf_reader.pages:
97
+ # text += page.extract_text()
98
+ # return text
99
+
100
+
101
+ # def get_files(text_doc):
102
+ # """Upload text files and extract text"""
103
+ # text =""
104
+ # for file in text_doc:
105
+ # print(text)
106
+ # if file.type == "text/plain":
107
+ # # Read the text directly from the file
108
+ # text += file.getvalue().decode("utf-8")
109
+ # elif file.type == "application/pdf":
110
+ # text += get_pdf_text(file)
111
+ # return text
112
+
113
+
114
+ # def get_text_chunks(text):
115
+ # """Create chunks of the extracted text"""
116
+ # text_splitter = RecursiveCharacterTextSplitter(
117
+ # chunk_size=900,
118
+ # chunk_overlap=0,
119
+ # separators="\n",
120
+ # add_start_index = True,
121
+ # length_function= len
122
+ # )
123
+ # chunks = text_splitter.split_text(text)
124
+ # return chunks
125
+
126
+
127
+ # def get_vectorstore(text_chunks, conn):
128
+ # """Create embeddings for the chunks and store them in a vectorstore"""
129
+ # embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
130
+ # vectorstore = PGVector.from_texts(texts=text_chunks, embedding=embeddings, connection=conn)
131
+ # return vectorstore
132
+
133
+
134
+ # def get_conversation_chain(vectorstore):
135
+ # llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.2, "max_length":1024})
136
+
137
+ # memory = ConversationBufferMemory(
138
+ # memory_key='chat_history', return_messages=True)
139
+ # conversation_chain = ConversationalRetrievalChain.from_llm(
140
+ # llm=llm,
141
+ # retriever=vectorstore.as_retriever(),
142
+ # memory=memory
143
+ # )
144
+ # return conversation_chain
145
+
146
+
147
+ # def handle_userinput(user_question):
148
+ # response = st.session_state.conversation({'question': user_question})
149
+ # st.session_state.chat_history = response['chat_history']
150
+
151
+ # for i, message in enumerate(st.session_state.chat_history):
152
+ # if i % 2 == 0:
153
+ # st.write(user_template.replace(
154
+ # "{{MSG}}", message.content), unsafe_allow_html=True)
155
+ # else:
156
+ # st.write(bot_template.replace(
157
+ # "{{MSG}}", message.content), unsafe_allow_html=True)
158
+
159
+
160
+ # def main():
161
+ # load_dotenv()
162
+ # st.set_page_config(page_title="ChatBot")
163
+ # st.write(css, unsafe_allow_html=True)
164
+
165
+ # if "conversation" not in st.session_state:
166
+ # st.session_state.conversation = None
167
+ # if "chat_history" not in st.session_state:
168
+ # st.session_state.chat_history = None
169
+
170
+ # # Connect to PostgreSQL
171
+ # conn = connect_to_postgresql()
172
+
173
+ # st.header("Chat Bot")
174
+ # user_question = st.text_input("Ask a question:")
175
+ # if user_question:
176
+ # handle_userinput(user_question, conn)
177
+
178
+ # with st.sidebar:
179
+ # st.subheader("Your documents")
180
+ # pdf_docs = st.file_uploader(
181
+ # "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
182
+ # if st.button("Process"):
183
+ # with st.spinner("Processing"):
184
+ # # get text
185
+ # raw_text = get_files(pdf_docs)
186
+
187
+ # # get the text chunks
188
+ # text_chunks = get_text_chunks(raw_text)
189
+
190
+ # # store embeddings in PostgreSQL
191
+ # store_embeddings_in_postgresql(text_chunks, conn)
192
+
193
+ # # create vector store
194
+ # vectorstore = get_vectorstore(text_chunks, conn)
195
+
196
+ # # create index in PostgreSQL
197
+ # create_index_in_postgresql(conn)
198
+
199
+ # # create conversation chain
200
+ # st.session_state.conversation = get_conversation_chain(
201
+ # vectorstore)
202
+
203
+ # if __name__ == '__main__':
204
+ # main()
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ