ngmitam committed on
Commit
c740382
·
1 Parent(s): ebe5f88

First commit

Browse files
Files changed (3) hide show
  1. .gitignore +2 -0
  2. app.py +100 -0
  3. requirements.txt +206 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ .env
2
+ .vscode
app.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from dotenv import load_dotenv
3
+ from PyPDF2 import PdfReader
4
+ from langchain.text_splitter import CharacterTextSplitter
5
+ from langchain.vectorstores import FAISS
6
+ from langchain.chat_models import ChatOpenAI
7
+ from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
8
+ from langchain.memory import ConversationBufferMemory
9
+ from langchain.chains import ConversationalRetrievalChain
10
+ from langchain.llms import GPT4All
11
+ from streamlit_chat import message
12
+ from huggingface_hub import hf_hub_download
13
+
14
+ from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
15
+
16
+
17
def get_pdf_text(pdfs):
    """Concatenate the extracted text of every page of every uploaded PDF.

    Args:
        pdfs: iterable of file-like objects readable by ``PyPDF2.PdfReader``
            (e.g. the list returned by ``st.file_uploader``).

    Returns:
        str: all page text joined together; "" when *pdfs* is empty.
    """
    text = ""
    for pdf in pdfs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() returns None for pages with no extractable text
            # (e.g. scanned images); coalesce to "" so `+=` never raises
            # TypeError.
            text += page.extract_text() or ""
    return text
24
+
25
+
26
def get_text_chunks(text):
    """Split *text* into overlapping ~1000-char chunks for embedding.

    Args:
        text: the raw document text (newline-separated).

    Returns:
        list[str]: the chunk strings produced by CharacterTextSplitter.
    """
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return splitter.split_text(text)
31
+
32
+
33
def get_vectorstore(text_chunks):
    """Embed *text_chunks* with a local MiniLM model and index them in FAISS.

    Args:
        text_chunks: list of chunk strings to embed.

    Returns:
        A FAISS vector store built over the chunks.
    """
    # embeddings = OpenAIEmbeddings()  # hosted alternative, kept for reference
    embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    return FAISS.from_texts(texts=text_chunks, embedding=embedder)
39
+
40
+
41
def get_conversation_chain(vectorstore):
    """Build a conversational retrieval chain over *vectorstore*.

    Uses the local GPT4All model at /tmp (downloaded by main()); token output
    is streamed to stdout via the callback handler. Chat history is kept in a
    ConversationBufferMemory so follow-up questions have context.

    Args:
        vectorstore: a vector store exposing ``as_retriever()``.

    Returns:
        A ConversationalRetrievalChain wired to the LLM, retriever and memory.
    """
    llm = GPT4All(
        model="/tmp/ggml-gpt4all-j-v1.3-groovy.bin",
        max_tokens=1000,
        backend='gptj',
        callbacks=[StreamingStdOutCallbackHandler()],
        n_batch=8,
        verbose=False,
    )
    # llm = ChatOpenAI()  # hosted alternative, kept for reference
    memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory,
    )
55
+
56
+
57
def user_input(user_question):
    """Send *user_question* to the conversation chain and render the chat.

    Re-renders the whole chat history: even-indexed messages are the user's
    turns, odd-indexed ones the model's replies.

    Args:
        user_question: the question string typed by the user.
    """
    if st.session_state.conversation is None:
        # main() initializes the chain to None; asking a question before
        # pressing "Process" would otherwise raise
        # "TypeError: 'NoneType' object is not callable" here.
        st.warning("Please upload and process your documents first.")
        return
    response = st.session_state.conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']
    for i, msg in enumerate(st.session_state.chat_history):
        # User turns alternate with model turns, starting with the user.
        message(msg.content, is_user=(i % 2 == 0))
67
+
68
+
69
def main():
    """Streamlit entry point: model download, page setup, question box, sidebar.

    Downloads the GPT4All weights to /tmp, initializes session state, shows a
    question input, and offers a sidebar uploader whose "Process" button builds
    the retrieval chain from the uploaded PDFs.
    """
    load_dotenv()
    # Fetch the local GPT4All weights; hf_hub_download is a no-op when the
    # file is already present in /tmp.
    hf_hub_download(
        repo_id="dnato/ggml-gpt4all-j-v1.3-groovy.bin",
        filename="ggml-gpt4all-j-v1.3-groovy.bin",
        local_dir="/tmp",
    )
    st.set_page_config(page_title="Chat with PDF")

    # Session state survives Streamlit reruns; seed both keys once.
    for key in ("conversation", "chat_history"):
        if key not in st.session_state:
            st.session_state[key] = None

    st.header("Chat with PDF")
    user_question = st.text_input("Ask a question about your documents...")
    if user_question:
        user_input(user_question)

    with st.sidebar:
        st.subheader("Your Documents")
        pdfs = st.file_uploader("Upload here", accept_multiple_files=True)
        if st.button("Process"):
            with st.spinner("Processing"):
                raw_text = get_pdf_text(pdfs)
                chunks = get_text_chunks(raw_text)
                vectorstore = get_vectorstore(chunks)
                st.session_state.conversation = get_conversation_chain(
                    vectorstore)
                st.success("Processing Complete !")
97
+
98
+
99
# Run the Streamlit app only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohttp==3.8.5
2
+ aiosignal==1.3.1
3
+ altair==5.0.1
4
+ anyio==3.7.1
5
+ appnope==0.1.3
6
+ argilla==1.1.1
7
+ astroid==2.15.5
8
+ asttokens==2.2.1
9
+ async-timeout==4.0.2
10
+ attrs==23.1.0
11
+ backcall==0.2.0
12
+ backoff==2.2.1
13
+ beautifulsoup4==4.12.2
14
+ blinker==1.6.2
15
+ cachetools==5.3.1
16
+ certifi==2023.5.7
17
+ cffi==1.15.1
18
+ chardet==5.2.0
19
+ charset-normalizer==3.1.0
20
+ chromadb==0.3.26
21
+ click==8.1.6
22
+ clickhouse-connect==0.6.8
23
+ colorclass==2.2.2
24
+ coloredlogs==15.0.1
25
+ comm==0.1.3
26
+ compressed-rtf==1.0.6
27
+ contourpy==1.1.0
28
+ cryptography==41.0.3
29
+ cycler==0.11.0
30
+ Cython==0.29.35
31
+ dataclasses-json==0.5.14
32
+ debugpy==1.6.7
33
+ decorator==5.1.1
34
+ Deprecated==1.2.14
35
+ dill==0.3.6
36
+ diskcache==5.6.1
37
+ duckdb==0.8.1
38
+ easygui==0.98.3
39
+ ebcdic==1.1.1
40
+ et-xmlfile==1.1.0
41
+ executing==1.2.0
42
+ extract-msg==0.41.5
43
+ faiss-cpu==1.7.4
44
+ fastapi==0.101.0
45
+ filelock==3.12.2
46
+ filetype==1.2.0
47
+ flatbuffers==23.5.26
48
+ fonttools==4.40.0
49
+ frozenlist==1.4.0
50
+ fsspec==2023.6.0
51
+ gitdb==4.0.10
52
+ GitPython==3.1.32
53
+ gpt4all==1.0.3
54
+ h11==0.9.0
55
+ hnswlib==0.7.0
56
+ httpcore==0.11.1
57
+ httptools==0.6.0
58
+ httpx==0.15.5
59
+ huggingface-hub==0.16.4
60
+ humanfriendly==10.0
61
+ idna==3.4
62
+ IMAPClient==2.3.1
63
+ importlib-metadata==6.8.0
64
+ InstructorEmbedding==1.0.1
65
+ ipykernel==6.23.3
66
+ ipython==8.14.0
67
+ isort==5.12.0
68
+ jedi==0.18.2
69
+ Jinja2==3.1.2
70
+ joblib==1.3.1
71
+ jsonschema==4.19.0
72
+ jsonschema-specifications==2023.7.1
73
+ jupyter_client==8.3.0
74
+ jupyter_core==5.3.1
75
+ kiwisolver==1.4.4
76
+ langchain==0.0.228
77
+ langchainplus-sdk==0.0.20
78
+ lark-parser==0.12.0
79
+ lazy-object-proxy==1.9.0
80
+ llama-cpp-python==0.1.68
81
+ lxml==4.9.3
82
+ lz4==4.3.2
83
+ Markdown==3.4.4
84
+ markdown-it-py==3.0.0
85
+ MarkupSafe==2.1.3
86
+ marshmallow==3.20.1
87
+ matplotlib==3.7.1
88
+ matplotlib-inline==0.1.6
89
+ mccabe==0.7.0
90
+ mdurl==0.1.2
91
+ monotonic==1.6
92
+ mpmath==1.3.0
93
+ msg-parser==1.2.0
94
+ msoffcrypto-tool==5.1.1
95
+ multidict==6.0.4
96
+ mypy-extensions==1.0.0
97
+ nest-asyncio==1.5.6
98
+ networkx==3.1
99
+ nltk==3.8.1
100
+ numexpr==2.8.5
101
+ numpy==1.25.0
102
+ olefile==0.46
103
+ oletools==0.60.1
104
+ onnxruntime==1.15.1
105
+ openai==0.27.8
106
+ openapi-schema-pydantic==1.2.4
107
+ openpyxl==3.1.2
108
+ overrides==7.4.0
109
+ packaging==23.1
110
+ pandas==1.5.3
111
+ pandoc==2.3
112
+ parso==0.8.3
113
+ pcodedmp==1.2.6
114
+ pdf2image==1.16.3
115
+ pdfminer.six==20221105
116
+ pexpect==4.8.0
117
+ pickleshare==0.7.5
118
+ Pillow==9.5.0
119
+ platformdirs==3.8.0
120
+ plumbum==1.8.2
121
+ ply==3.11
122
+ posthog==3.0.1
123
+ prompt-toolkit==3.0.38
124
+ protobuf==4.23.4
125
+ psutil==5.9.5
126
+ ptyprocess==0.7.0
127
+ pulsar-client==3.2.0
128
+ pure-eval==0.2.2
129
+ pyarrow==12.0.1
130
+ pycocotools @ git+https://github.com/leimao/cocoapi.git@8c9bcc3cf640524c4c20a9c40e89cb6a2f2fa0e9#subdirectory=PythonAPI
131
+ pycparser==2.21
132
+ pycryptodome==3.15.0
133
+ pydantic==1.10.12
134
+ pydeck==0.8.0
135
+ Pygments==2.15.1
136
+ pylint==2.17.4
137
+ Pympler==1.0.1
138
+ PyMuPDF==1.22.5
139
+ pypandoc==1.11
140
+ pyparsing==2.4.7
141
+ PyPDF2==3.0.1
142
+ python-dateutil==2.8.2
143
+ python-docx==0.8.11
144
+ python-dotenv==1.0.0
145
+ python-magic==0.4.27
146
+ python-pptx==0.6.21
147
+ pytz==2023.3
148
+ pytz-deprecation-shim==0.1.0.post0
149
+ pywatchman==1.4.1
150
+ PyYAML==6.0.1
151
+ pyzmq==25.1.0
152
+ red-black-tree-mod==1.20
153
+ referencing==0.30.2
154
+ regex==2023.6.3
155
+ requests==2.31.0
156
+ rfc3986==1.5.0
157
+ rich==13.5.2
158
+ rpds-py==0.9.2
159
+ RTFDE==0.0.2
160
+ safetensors==0.3.1
161
+ scikit-learn==1.3.0
162
+ scipy==1.11.1
163
+ sentence-transformers==2.2.2
164
+ sentencepiece==0.1.99
165
+ six==1.16.0
166
+ smmap==5.0.0
167
+ sniffio==1.3.0
168
+ soupsieve==2.4.1
169
+ SQLAlchemy==2.0.19
170
+ stack-data==0.6.2
171
+ starlette==0.27.0
172
+ streamlit==1.24.0
173
+ streamlit-chat==0.1.1
174
+ sympy==1.12
175
+ tabulate==0.9.0
176
+ tenacity==8.2.2
177
+ threadpoolctl==3.2.0
178
+ tiktoken==0.4.0
179
+ tokenizers==0.13.3
180
+ toml==0.10.2
181
+ tomlkit==0.11.8
182
+ toolz==0.12.0
183
+ torch==2.0.1
184
+ torchvision==0.15.2
185
+ tornado==6.3.2
186
+ tqdm==4.65.0
187
+ traitlets==5.9.0
188
+ transformers==4.31.0
189
+ typing-inspect==0.9.0
190
+ typing_extensions==4.7.1
191
+ tzdata==2023.3
192
+ tzlocal==4.3.1
193
+ unstructured==0.8.0
194
+ urllib3==2.0.3
195
+ uvicorn==0.23.2
196
+ uvloop==0.17.0
197
+ validators==0.21.2
198
+ watchfiles==0.19.0
199
+ wcwidth==0.2.6
200
+ websockets==11.0.3
201
+ wrapt==1.13.3
202
+ xlrd==2.0.1
203
+ XlsxWriter==3.1.2
204
+ yarl==1.9.2
205
+ zipp==3.16.2
206
+ zstandard==0.21.0