polaris404 committed on
Commit
2421315
·
verified ·
1 Parent(s): f468738

Files Added

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ vectordb/The-100-Page-Machine-Learning-Book/index.faiss filter=lfs diff=lfs merge=lfs -text
.streamlit/config.toml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [server]
2
+ enableXsrfProtection = false
3
+ enableCORS = false
app.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+ from langchain.embeddings.openai import OpenAIEmbeddings
4
+ from langchain_community.llms import OpenAI
5
+ from langchain_community.vectorstores import FAISS
6
+ from langchain_community.callbacks import get_openai_callback
7
+ from langchain.chains.question_answering import load_qa_chain
8
+
9
+ import streamlit as st
10
+
11
+ from PyPDF2 import PdfReader
12
+
13
+ from dotenv import load_dotenv
14
+ load_dotenv()
15
+
16
+ import re
17
+
18
+ ##! Converting PDF to text ##
19
+ # def read_pdf(pdf):
20
+ # pdf_reader = PdfReader(pdf)
21
+ # text = ""
22
+ # for page in pdf_reader.pages:
23
+ # text += page.extract_text()
24
+ # return text
25
+
26
+ ##! Saving vectorstore locally ##
27
+ # def save_vectorstore(title, vector_store):
28
+ # title = re.sub('\s+', '-', title)
29
+ # title = re.sub('[^a-zA-Z0-9\-]+', '', title)
30
+ # vector_store.save_local(f"./vectordb/{title[:35]}")
31
+ # print(f"{title} stored!")
32
+ # return True
33
+
34
+ ##* Load vectorstore ##
35
def load_vectorstore(name, embeddings):
    """Load a previously saved FAISS vector store from ``./vectordb/<name>/``.

    Args:
        name: Directory name of the saved index under ``./vectordb``.
        embeddings: Embedding object passed through to ``FAISS.load_local``.

    Returns:
        Whatever ``FAISS.load_local`` returns for that directory.
    """
    index_dir = f"./vectordb/{name}/"
    # NOTE(review): allow_dangerous_deserialization=True opts in to loading
    # the pickled index.pkl next to index.faiss. Only point this at index
    # directories shipped with the app, never at user-supplied files.
    return FAISS.load_local(
        index_dir,
        embeddings=embeddings,
        allow_dangerous_deserialization=True,
    )
38
+
39
def main():
    """Render the "Chat with PDF" page and answer a question about a PDF.

    Flow:
      1. Draw the header and the disclaimer box.
      2. Let the user pick one of the bundled PDFs and type a question.
      3. If a PDF is picked, load its saved FAISS index from ``./vectordb``.
      4. If a question was typed as well, fetch the 3 most similar chunks,
         run the "stuff" QA chain over them and write the answer.

    A question typed without a selected PDF only shows a warning; the QA
    step is never reached without a loaded vector store.
    """
    st.header('Chat with PDF 💬')

    # pdf = st.file_uploader("Upload PDF", type='pdf')

    embeddings = OpenAIEmbeddings()

    if 'clicked' not in st.session_state:
        st.session_state.clicked = False

    # Only referenced by the disabled "Load" button below; kept so the upload
    # flow can be re-enabled by uncommenting.
    def click_button():
        st.session_state.clicked = True

    # st.button('Load', on_click=click_button)
    with st.container(border=True):
        st.markdown('''
        *Disclaimer: Section for uploading the PDF file has been removed as the API calls for OpenAI are not free. I've included few pdfs for Q&A. You can access the source code and enable the section for uploading PDFs.*
        ''')

    ##! Converting text to word Embeddings ##
    # if st.session_state.clicked:
    #     if pdf is not None:
    #         text = read_pdf(pdf)
    #         text_splitter = RecursiveCharacterTextSplitter(
    #             chunk_size=1000,
    #             chunk_overlap=200,
    #             length_function=len
    #         )
    #         chunks = text_splitter.split_text(text=text)
    #         vector_store = FAISS.from_texts(chunks, embedding=embeddings)
    #         if save_btn:
    #             save_vectorstore(pdf.name[:-4], vector_store)

    option = st.selectbox(
        label="Select the PDF: ",
        options=['Budget Speech 2024', 'The 100 Page Machine Learning Book'],
        index=None,
    )

    query = st.text_input("Ask questions from your PDF file:")

    if not option:
        # Nothing to search yet; nudge the user only if they already asked.
        if query:
            st.warning("Please Select a PDF")
        return

    # Turn the display title into the on-disk directory name:
    # whitespace runs -> '-', then drop everything except [a-zA-Z0-9-].
    # Raw strings keep '\s' and '\-' as regex escapes rather than (invalid)
    # Python string escapes.
    title = re.sub(r'\s+', '-', option)
    title = re.sub(r'[^a-zA-Z0-9\-]+', '', title)
    vector_store = load_vectorstore(title, embeddings)

    if not query:
        return

    docs = vector_store.similarity_search(query=query, k=3)
    llm = OpenAI(temperature=0)
    chain = load_qa_chain(llm=llm, chain_type="stuff")
    with get_openai_callback() as cb:
        response = chain.invoke({"input_documents": docs, "question": query})
        # The callback object is printed to the server console, not the page.
        print(cb)
    st.write(response["output_text"])


if __name__ == '__main__':
    main()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ PyPDF2
3
+ langchain
4
+ langchain_community
5
+ openai
6
+ tiktoken
7
+ faiss-cpu
8
+ python-dotenv
vectordb/Budget-Speech-2024/index.faiss ADDED
Binary file (277 kB). View file
 
vectordb/Budget-Speech-2024/index.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f26f0949132ab2b73881fa767ffdb7943c53b38c1a3f7c8ab548b1d7e7e77280
3
+ size 48844
vectordb/The-100-Page-Machine-Learning-Book/index.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6fecddc1fa588c31bdb8a707349a831503e609d89d5b0ade3a9007b03d56f3f
3
+ size 2150445
vectordb/The-100-Page-Machine-Learning-Book/index.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ce1dc384be60be0ce9b6489f2892c15e734d73c43b9b5b6878080611a39459f
3
+ size 379563
vectordb/test/index.faiss ADDED
Binary file (135 kB). View file
 
vectordb/test/index.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de9e7792ae26942b5b47586eac60a9e8339b598aacf5c404e15b6a03c5deac6b
3
+ size 23798