Spaces:

raseel-zymr
/

Document-QandA

Sleeping

App Files Files Community

raseel-zymr committed on Jun 20, 2023

Commit

8c5d334

1 Parent(s): 952eb35

Add support for PDF files

Browse files

Files changed (1) hide show

app.py +62 -64

app.py CHANGED Viewed

@@ -15,96 +15,94 @@ from langchain.vectorstores import FAISS
 #facebook vectorization
 from langchain.chains.question_answering import load_qa_chain
 #load pdf
 from langchain.document_loaders import UnstructuredPDFLoader
 os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets["hf_api_key"]
-def pdf_file(filename):
-	st.subheader('Uploaded PDF File:')
-	st.write(filename)
-def text_file(filename):
-	st.subheader('Uploaded Text File:')
-	st.write(filename)
-	# loader = TextLoader(filename)
-	# documents = loader.load()
-	# # Text Splitter
-	# text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
-	# docs = text_splitter.split_documents(documents)
-	# db = FAISS.from_documents(docs, embeddings)
-	# chain = load_qa_chain(llm2, chain_type="stuff")
-st.title('Document Q&A - Ask anything in your Document')
-st.subheader('This application can be used to upload text(.txt) and PDF(.pdf) files and ask questions about their contents.')
-st.sidebar.subheader('Upload document')
-uploaded_file = st.sidebar.file_uploader("Upload File",type=['txt','pdf'])
-if Path(uploaded_file.name).suffix == '.txt':
-	text_file(uploaded_file.name)
-if Path(uploaded_file.name).suffix == '.pdf':
-	pdf_file(uploaded_file.name)
-with st.sidebar.expander('File'):
-    if (uploaded_file):
-	    st.info(uploaded_file.name)
-if os.path.exists('/content/'):
-	st.info(os.listdir('/content/'))
-# url2 = "https://github.com/fabiomatricardi/cdQnA/raw/main/KS-all-info_rev1.txt"
-# res = requests.get(url2)
-# with open("KS-all-info_rev1.txt", "w") as f:
-#   f.write(res.text)
-if (uploaded_file):
 	st.subheader('Enter query')
 	query = st.text_input('Ask anything about the Document you uploaded')
-	stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
-	with open(uploaded_file.name, "w") as f:
-  		f.write(stringio.read())
-if(uploaded_file):
-	loader = TextLoader(uploaded_file.name)
-	documents = loader.load()
-	# import textwrap
-	# def wrap_text_preserve_newlines(text, width=110):
-    # 	# Split the input text into lines based on newline characters
-    # 	lines = text.split('\n')
-    # 	# Wrap each line individually
-    # 	wrapped_lines = [textwrap.fill(line, width=width) for line in lines]
-    # 	# Join the wrapped lines back together using newline characters
-    # 	wrapped_text = '\n'.join(wrapped_lines)
-    # 	return wrapped_text
-	# Text Splitter
-	text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
-	docs = text_splitter.split_documents(documents)
-	# Embeddings
-	embeddings = HuggingFaceEmbeddings()
-	#Create the vectorized db
-	db = FAISS.from_documents(docs, embeddings)
-	#llm=HuggingFaceHub(repo_id="google/flan-t5-xl", model_kwargs={"temperature":0, "max_length":512})
-	llm2=HuggingFaceHub(repo_id="declare-lab/flan-alpaca-large", model_kwargs={"temperature":0, "max_length":512})
-	chain = load_qa_chain(llm2, chain_type="stuff")
-	# Sample question
-	#query = "What the actual issues and drawbacks ?"
-	docs = db.similarity_search(query)
-	answer = chain.run(input_documents=docs, question=query)
-	st.subheader('Answer')
-	st.write(answer)
 # # PDFs

 #facebook vectorization
 from langchain.chains.question_answering import load_qa_chain
 #load pdf
+#vectorize db index with chromadb
+from langchain.indexes import VectorstoreIndexCreator
+from langchain.chains import RetrievalQA
 from langchain.document_loaders import UnstructuredPDFLoader
 os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets["hf_api_key"]
+def init():
+	"""Create the shared embedding model, LLM client and QA chain.
+
+	Binds three module-level globals that the file handlers read:
+	``embeddings`` (a default ``HuggingFaceEmbeddings``), ``llm`` (a
+	``HuggingFaceHub`` client for ``declare-lab/flan-alpaca-large``) and
+	``chain`` (a "stuff" question-answering chain built on ``llm``).
+	"""
+	# Only names that are actually assigned below are declared global.
+	global embeddings, llm, chain
+	# Embeddings
+	embeddings = HuggingFaceEmbeddings()
+	llm=HuggingFaceHub(repo_id="declare-lab/flan-alpaca-large", model_kwargs={"temperature":0, "max_length":512})
+	chain = load_qa_chain(llm, chain_type="stuff")
+def pdf_file(txtFileObj):
+	"""Index an uploaded PDF and answer a question about its contents.
+
+	The upload is saved to the working directory, loaded with
+	``UnstructuredPDFLoader``, split into 1000-character chunks and indexed
+	through ``VectorstoreIndexCreator``. A ``RetrievalQA`` chain over that
+	index answers whatever the user types into the query box.
+
+	Args:
+		txtFileObj: The uploaded file; must expose ``name`` and
+			``getbuffer()`` (presumably a Streamlit ``UploadedFile`` --
+			confirm against the caller).
+
+	Relies on the module globals ``embeddings`` and ``llm`` set by ``init()``.
+	"""
+	st.subheader('Uploaded PDF File:')
+	st.write(txtFileObj.name)
+	# Keep only the final path component so a crafted upload name such as
+	# "../../x.pdf" cannot be written outside the working directory.
+	local_path = Path(txtFileObj.name).name
+	with open(local_path, "wb") as f:
+		f.write(txtFileObj.getbuffer())
+	loaders = [UnstructuredPDFLoader(local_path)]
+	index = VectorstoreIndexCreator(
+		embedding=embeddings,
+		text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)).from_loaders(loaders)
+	# Local chain: deliberately separate from the module-level ``chain``,
+	# because this one retrieves its own context from the index.
+	chain = RetrievalQA.from_chain_type(llm=llm,
+		chain_type="stuff",
+		retriever=index.vectorstore.as_retriever(),
+		input_key="question")
+	st.subheader('Enter query')
+	query = st.text_input('Ask anything about the Document you uploaded')
+	# An empty text box yields a falsy string; skip the LLM call until the user asks something.
+	if query:
+		answer = chain.run(question=query)
+		st.subheader('Answer')
+		st.write(answer)
+def text_file(txtFileObj):
+	"""Index an uploaded text file and answer a question about its contents.
+
+	The upload is saved to the working directory, loaded with ``TextLoader``,
+	split into 1000-character chunks (overlap 10) and embedded into a FAISS
+	store. The chunks most similar to the user's query are passed to the
+	module-level QA ``chain``.
+
+	Args:
+		txtFileObj: The uploaded file; must expose ``name`` and
+			``getbuffer()`` (presumably a Streamlit ``UploadedFile`` --
+			confirm against the caller).
+
+	Relies on the module globals ``embeddings`` and ``chain`` set by ``init()``.
+	"""
+	st.subheader('Uploaded Text File:')
+	st.write(txtFileObj.name)
+	# Keep only the final path component so a crafted upload name such as
+	# "../../x.txt" cannot be written outside the working directory.
+	local_path = Path(txtFileObj.name).name
+	with open(local_path, "wb") as f:
+		f.write(txtFileObj.getbuffer())
+	loader = TextLoader(local_path)
+	documents = loader.load()
+	# Text Splitter
+	text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
+	docs = text_splitter.split_documents(documents)
+	db = FAISS.from_documents(docs, embeddings)
+	st.subheader('Enter query')
+	query = st.text_input('Ask anything about the Document you uploaded')
+	# An empty text box yields a falsy string; skip the search until the user asks something.
+	if query:
+		docs = db.similarity_search(query)
+		answer = chain.run(input_documents=docs, question=query)
+		st.subheader('Answer')
+		st.write(answer)
+# Page layout and dispatch: build the shared models, accept one upload in the
+# sidebar, and hand it to the handler matching its file extension.
+st.title('Document Q&A - Ask anything in your Document')
+st.subheader('This application can be used to upload text(.txt) and PDF(.pdf) files and ask questions about their contents.')
+init()
+st.sidebar.subheader('Upload document')
+uploaded_file = st.sidebar.file_uploader("Upload File",type=['txt','pdf'])
+if uploaded_file:
+	# Compare the extension in lower case so "REPORT.PDF" / "NOTES.TXT" are
+	# handled the same as their lower-case spellings instead of being ignored.
+	suffix = Path(uploaded_file.name).suffix.lower()
+	if suffix == '.txt':
+		st.sidebar.info(Path(uploaded_file.name))
+		text_file(uploaded_file)
+	elif suffix == '.pdf':
+		pdf_file(uploaded_file)
+with st.sidebar.expander('File'):
+	if uploaded_file:
+		st.info(uploaded_file.name)
+# NOTE(review): lists '/content/' when that directory exists -- looks like a
+# leftover debugging aid; confirm whether it is still wanted.
+if os.path.exists('/content/'):
+	st.info(os.listdir('/content/'))
 # # PDFs