yogjoshi14 committed
Commit 3b1a154 · 0 Parent(s)

making workflows

Files changed (4)
  1. PDF_chat.png +0 -0
  2. README.md +40 -0
  3. app.py +142 -0
  4. requirements.txt +12 -0
PDF_chat.png ADDED
README.md ADDED
@@ -0,0 +1,40 @@
---
title: Chat With Documents
emoji: 🦀
colorFrom: green
colorTo: red
sdk: streamlit
sdk_version: 1.31.0
app_file: app.py
pinned: false
---

# qp-ai-assessment
A simple contextual chat bot.

1. Read a long PDF or Word document.
2. Build a chat bot that uses the document as context to answer questions.
3. If the answer is not found in the document, it should say "I don't know the answer" (see the prompt sketch below).

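Point 3 is behaviour the LLM has to be told about. One common way to get it, which `app.py` does not currently implement, is to pass a custom question-answering prompt into the chain; treat the prompt wording and the `combine_docs_chain_kwargs` hook below as a hedged sketch rather than the project's method:

```python
from langchain.prompts import PromptTemplate
from langchain.chains import ConversationalRetrievalChain
from langchain_community.llms import HuggingFaceHub

# Same LLM as app.py; requires a Hugging Face API token in the environment.
llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature": 0.5, "max_length": 512})

# Hypothetical prompt wording: instruct the model to refuse when the context lacks the answer.
qa_prompt = PromptTemplate.from_template(
    "Answer the question using only the context below.\n"
    "If the answer is not in the context, reply exactly: I don't know the answer.\n\n"
    "Context:\n{context}\n\nQuestion: {question}\nAnswer:"
)

# `retriever` is the object built by embeddings_on_pinecone() in app.py.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    combine_docs_chain_kwargs={"prompt": qa_prompt},
)
```
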
Advanced Challenge:
- Break down the document into multiple chunks/paragraphs.
- Store them in a vector database such as Pinecone.
- When you ask a question, find the top 3 chunks most likely to contain the answer using semantic similarity search (a minimal sketch follows this list).

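A minimal sketch of that flow, mirroring what `app.py` does. It assumes the `PINECONE_API_KEY`/`PINECONE_ENV` environment variables are set, the `qp-ai-assessment` index exists, and `raw_text` holds the extracted document text; the query string is only a placeholder:

```python
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Pinecone

# Split the raw document text into overlapping chunks.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = splitter.create_documents(splitter.split_text(raw_text))

# Embed the chunks and store them in the Pinecone index.
vectordb = Pinecone.from_documents(chunks, HuggingFaceEmbeddings(), index_name="qp-ai-assessment")

# Semantic similarity search: the 3 chunks most likely to contain the answer.
top_chunks = vectordb.similarity_search("placeholder question about the document", k=3)
```
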
## System Design

![Architecture](https://raw.githubusercontent.com/YogJoshi14/qp-ai-assessment/main/PDF_chat.png)

## Required Packages

1. LangChain: a framework for developing applications powered by language models. [Docs](https://python.langchain.com/docs/get_started/introduction)
2. Pinecone: makes it easy to provide long-term memory for high-performance AI applications. It's a managed, cloud-native vector database with a simple API and no infrastructure hassles. Pinecone serves fresh, filtered query results with low latency at the scale of billions of vectors. [Docs](https://docs.pinecone.io/docs/quickstart)
3. sentence-transformers: a Python framework for state-of-the-art sentence, text, and image embeddings. The initial work is described in the paper Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks. [Docs](https://www.sbert.net/) (A short embedding example follows this list.)
4. pdf2image: a Python module that wraps the pdftoppm and pdftocairo utilities to convert PDFs into images. [Docs](https://pdf2image.readthedocs.io/en/latest/overview.html)
5. PyPDF2: a free and open-source, pure-Python PDF library capable of splitting, merging, cropping, and transforming the pages of PDF files. It can also add custom data, viewing options, and passwords to PDF files, and can retrieve text and metadata from PDFs. [Docs](https://pypdf2.readthedocs.io/en/latest/)
6. transformers: provides APIs and tools to easily download and train state-of-the-art pretrained models. Using pretrained models can reduce compute costs and carbon footprint, and save the time and resources required to train a model from scratch. [Docs](https://huggingface.co/docs/transformers/en/index)

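For illustration, a minimal embedding sketch with sentence-transformers. The checkpoint name is only an example; `app.py` itself relies on LangChain's default `HuggingFaceEmbeddings` model, which produces 768-dimensional vectors:

```python
from sentence_transformers import SentenceTransformer

# Example checkpoint; any SentenceTransformers model works the same way.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode document chunks into dense vectors that can be stored in Pinecone.
vectors = model.encode(["First chunk of the document.", "Second chunk of the document."])
print(vectors.shape)  # (2, 384) for this particular model
```
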
## Limitations

1. Embeddings: the project uses the readily available default Hugging Face embeddings, which are limited to 768 dimensions. Alternate embeddings such as HuggingFaceInstructEmbeddings or Ollama embeddings (both open source), or OpenAI embeddings, could be used instead (see the sketch below).
2. LLM: using an LLM with more parameters, trained on more data, could also yield better results.

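A hedged sketch of swapping the embedding model inside `embeddings_on_pinecone()` in `app.py`. It assumes the optional `InstructorEmbedding` dependency is installed; the Pinecone index dimension must match the chosen model's embedding size:

```python
from langchain_community.embeddings import HuggingFaceInstructEmbeddings

# Drop-in replacement for HuggingFaceEmbeddings() in embeddings_on_pinecone().
embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-large")
```
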
app.py ADDED
@@ -0,0 +1,142 @@
import os
import pinecone

from langchain.chains import ConversationalRetrievalChain
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceHub

from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Pinecone

import streamlit as st


st.set_page_config(page_title="chatbot")
st.title("Chat with Documents")


num_of_top_selection = 3   # number of chunks retrieved per query
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50
embedding_dim = 768        # dimension of the default HuggingFaceEmbeddings model

# Initialize Pinecone
pc = pinecone.Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
index_name = "qp-ai-assessment"


def recreate_index():
    # Check if the index exists, and delete it if it does
    existing_indexes = pc.list_indexes().names()
    print(existing_indexes)
    if index_name in existing_indexes:
        pc.delete_index(index_name)
        print(f"Deleted existing index: {index_name}")

    # Create a new index sized for the 768-dim HuggingFace embeddings
    pc.create_index(
        name=index_name,
        metric='cosine',
        dimension=embedding_dim,
        spec=pinecone.PodSpec(os.environ.get("PINECONE_ENV")),
    )
    print(f"Created new index: {index_name}")


def load_documents(pdf_docs):
    # Concatenate the extracted text of every page of every uploaded PDF
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text


def split_documents(documents):
    # Break the raw text into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
    texts = text_splitter.split_text(documents)
    return text_splitter.create_documents(texts)


def embeddings_on_pinecone(texts):
    # Use HuggingFace embeddings for transforming text into numerical vectors
    embeddings = HuggingFaceEmbeddings()
    vectordb = Pinecone.from_documents(texts, embeddings, index_name=st.session_state.pinecone_index)
    retriever = vectordb.as_retriever(search_kwargs={'k': num_of_top_selection})
    return retriever


def query_llm(retriever, query):
    # llm = OpenAIChat(openai_api_key=st.session_state.openai_api_key)
    llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature": 0.5, "max_length": 512})
    qa_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=retriever,
        return_source_documents=True,
    )
    result = qa_chain({'question': query, 'chat_history': st.session_state.messages})
    result = result['answer']
    st.session_state.messages.append((query, result))
    return result


def input_fields():
    with st.sidebar:
        # if "openai_api_key" in st.secrets:
        #     st.session_state.openai_api_key = st.secrets.openai_api_key
        # else:
        #     st.session_state.openai_api_key = st.text_input("OpenAI API key", type="password")
        st.session_state.pinecone_api_key = os.environ.get("PINECONE_API_KEY")
        st.session_state.pinecone_env = os.environ.get("PINECONE_ENV")
        st.session_state.pinecone_index = index_name
        st.session_state.source_docs = st.file_uploader(label="Upload Documents", type="pdf", accept_multiple_files=True)


def process_documents():
    if not st.session_state.pinecone_api_key or not st.session_state.pinecone_env or not st.session_state.pinecone_index or not st.session_state.source_docs:
        st.warning("Please upload the documents and provide the missing fields.")
    else:
        try:
            if st.session_state.source_docs:
                # recreate_index()
                documents = load_documents(st.session_state.source_docs)
                texts = split_documents(documents)
                st.session_state.retriever = embeddings_on_pinecone(texts)
        except Exception as e:
            st.error(f"An error occurred: {e}")


def boot():
    input_fields()
    st.button("Submit Documents", on_click=process_documents)
    if "messages" not in st.session_state:
        st.session_state.messages = []
    # Replay the conversation so far
    for message in st.session_state.messages:
        st.chat_message('human').write(message[0])
        st.chat_message('ai').write(message[1])
    if query := st.chat_input():
        # Guard against asking a question before documents have been processed
        if "retriever" not in st.session_state:
            st.warning("Please submit documents before asking a question.")
        else:
            st.chat_message("human").write(query)
            response = query_llm(st.session_state.retriever, query)
            st.chat_message("ai").write(response)


if __name__ == '__main__':
    boot()
requirements.txt ADDED
@@ -0,0 +1,12 @@
langchain
langchain-community
PyPDF2
streamlit
tiktoken
pinecone-client
sentence-transformers==2.2.2
accelerate
transformers
huggingface-hub
python-docx
textract