Spaces:
Sleeping
Sleeping
Files Added
Browse files- .gitattributes +1 -0
- .streamlit/config.toml +3 -0
- app.py +99 -0
- requirements.txt +8 -0
- vectordb/Budget-Speech-2024/index.faiss +0 -0
- vectordb/Budget-Speech-2024/index.pkl +3 -0
- vectordb/The-100-Page-Machine-Learning-Book/index.faiss +3 -0
- vectordb/The-100-Page-Machine-Learning-Book/index.pkl +3 -0
- vectordb/test/index.faiss +0 -0
- vectordb/test/index.pkl +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
vectordb/The-100-Page-Machine-Learning-Book/index.faiss filter=lfs diff=lfs merge=lfs -text
|
.streamlit/config.toml
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
[server]
|
2 |
+
enableXsrfProtection = false
|
3 |
+
enableCORS = false
|
app.py
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
3 |
+
from langchain.embeddings.openai import OpenAIEmbeddings
|
4 |
+
from langchain_community.llms import OpenAI
|
5 |
+
from langchain_community.vectorstores import FAISS
|
6 |
+
from langchain_community.callbacks import get_openai_callback
|
7 |
+
from langchain.chains.question_answering import load_qa_chain
|
8 |
+
|
9 |
+
import streamlit as st
|
10 |
+
|
11 |
+
from PyPDF2 import PdfReader
|
12 |
+
|
13 |
+
from dotenv import load_dotenv
|
14 |
+
load_dotenv()
|
15 |
+
|
16 |
+
import re
|
17 |
+
|
18 |
+
##! Converting PDF to text ##
|
19 |
+
# def read_pdf(pdf):
|
20 |
+
# pdf_reader = PdfReader(pdf)
|
21 |
+
# text = ""
|
22 |
+
# for page in pdf_reader.pages:
|
23 |
+
# text += page.extract_text()
|
24 |
+
# return text
|
25 |
+
|
26 |
+
##! Saving vectorstore locally ##
|
27 |
+
# def save_vectorstore(title, vector_store):
|
28 |
+
# title = re.sub('\s+', '-', title)
|
29 |
+
# title = re.sub('[^a-zA-Z0-9\-]+', '', title)
|
30 |
+
# vector_store.save_local(f"./vectordb/{title[:35]}")
|
31 |
+
# print(f"{title} stored!")
|
32 |
+
# return True
|
33 |
+
|
34 |
+
##* Load vectorstore ##
|
35 |
+
def load_vectorstore(name, embeddings):
    """Load the FAISS index stored under ``./vectordb/<name>/``.

    Args:
        name: Directory name of the saved index inside ``./vectordb``.
        embeddings: Embedding object handed to ``FAISS.load_local``.

    Returns:
        Whatever ``FAISS.load_local`` returns for that directory.
    """
    index_dir = f"./vectordb/{name}/"
    # NOTE(security): allow_dangerous_deserialization=True is passed because
    # the saved store ships an index.pkl file; only point this at directories
    # whose contents you trust.
    return FAISS.load_local(
        index_dir,
        embeddings=embeddings,
        allow_dangerous_deserialization=True,
    )
|
38 |
+
|
39 |
+
def _store_name_from_title(title):
    """Turn a display title into the directory name used under ./vectordb."""
    # Raw strings keep the regex escapes (\s, \-) out of Python's own string
    # escape processing, which warns on invalid escapes in newer versions.
    hyphenated = re.sub(r'\s+', '-', title)
    return re.sub(r'[^a-zA-Z0-9\-]+', '', hyphenated)


def main():
    """Render the "Chat with PDF" page and answer questions about a stored PDF.

    The page shows a header, a disclaimer, a select box listing the bundled
    PDFs and a text input for the question. Once a PDF is selected its FAISS
    index is loaded; when a question is also present, the three most similar
    chunks are retrieved and passed to a "stuff" question-answering chain
    backed by ``OpenAI(temperature=0)``. The chain's ``output_text`` is
    written to the page and the OpenAI callback summary is printed to stdout.
    """
    st.header('Chat with PDF 💬')

    # pdf = st.file_uploader("Upload PDF", type='pdf')

    embeddings = OpenAIEmbeddings()
    # Stays None until an index has actually been loaded, so the query step
    # below can never run against a placeholder value.
    vector_store = None

    if 'clicked' not in st.session_state:
        st.session_state.clicked = False

    def click_button():
        # Only used by the (currently disabled) 'Load' button below.
        st.session_state.clicked = True

    # st.button('Load', on_click=click_button)
    with st.container(border=True):
        st.markdown('''
        *Disclaimer: Section for uploading the PDF file has been removed as the API calls for OpenAI are not free. I've included few pdfs for Q&A. You can access the source code and enable the section for uploading PDFs.*
        ''')

    ##! Converting text to word Embeddings ##
    # if st.session_state.clicked:
    #     if pdf is not None:
    #         text = read_pdf(pdf)
    #         text_splitter = RecursiveCharacterTextSplitter(
    #             chunk_size=1000,
    #             chunk_overlap=200,
    #             length_function=len
    #         )
    #         chunks = text_splitter.split_text(text=text)
    #         vector_store = FAISS.from_texts(chunks, embedding=embeddings)
    #         if save_btn:
    #             save_vectorstore(pdf.name[:-4], vector_store)

    option = st.selectbox(
        label="Select the PDF: ",
        options=['Budget Speech 2024', 'The 100 Page Machine Learning Book'],
        index=None,
    )

    query = st.text_input("Ask questions from your PDF file:")

    if query and not option:
        st.warning("Please Select a PDF")

    if option:
        vector_store = load_vectorstore(_store_name_from_title(option), embeddings)

    # Answer only when there is both a question and a loaded index.
    if query and vector_store is not None:
        docs = vector_store.similarity_search(query=query, k=3)
        llm = OpenAI(temperature=0)
        chain = load_qa_chain(llm=llm, chain_type="stuff")
        with get_openai_callback() as cb:
            response = chain.invoke({"input_documents": docs, "question": query})
            print(cb)
        st.write(response["output_text"])


if __name__ == '__main__':
    main()
|
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
|
2 |
+
PyPDF2
|
3 |
+
langchain
|
4 |
+
langchain_community
|
5 |
+
openai
|
6 |
+
tiktoken
|
7 |
+
faiss-cpu
|
8 |
+
python-dotenv
|
vectordb/Budget-Speech-2024/index.faiss
ADDED
Binary file (277 kB). View file
|
|
vectordb/Budget-Speech-2024/index.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f26f0949132ab2b73881fa767ffdb7943c53b38c1a3f7c8ab548b1d7e7e77280
|
3 |
+
size 48844
|
vectordb/The-100-Page-Machine-Learning-Book/index.faiss
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b6fecddc1fa588c31bdb8a707349a831503e609d89d5b0ade3a9007b03d56f3f
|
3 |
+
size 2150445
|
vectordb/The-100-Page-Machine-Learning-Book/index.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8ce1dc384be60be0ce9b6489f2892c15e734d73c43b9b5b6878080611a39459f
|
3 |
+
size 379563
|
vectordb/test/index.faiss
ADDED
Binary file (135 kB). View file
|
|
vectordb/test/index.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:de9e7792ae26942b5b47586eac60a9e8339b598aacf5c404e15b6a03c5deac6b
|
3 |
+
size 23798
|