File size: 4,415 Bytes
eaf0e00
 
952eb35
5aee298
eaf0e00
 
 
 
 
 
 
 
 
 
 
 
 
8c5d334
 
 
eaf0e00
 
 
 
952eb35
8c5d334
 
 
 
 
 
952eb35
8c5d334
 
 
952eb35
8c5d334
 
952eb35
8c5d334
 
 
 
 
 
 
 
 
952eb35
8c5d334
 
952eb35
8c5d334
 
952eb35
8c5d334
 
952eb35
8c5d334
 
 
952eb35
8c5d334
 
 
 
 
 
5aee298
8c5d334
 
 
5aee298
8c5d334
5aee298
 
 
 
8c5d334
 
 
5aee298
8c5d334
 
eaf0e00
8c5d334
 
eaf0e00
8c5d334
eaf0e00
8c5d334
 
eaf0e00
8c5d334
 
 
eaf0e00
8c5d334
 
eaf0e00
8c5d334
 
 
 
 
eaf0e00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import os
import streamlit as st
from pathlib import Path
from io import StringIO

#for textfiles
from langchain.document_loaders import TextLoader
#text splitter
from langchain.text_splitter import CharacterTextSplitter
#for using HugginFace models & embeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain import HuggingFaceHub
# Vectorstore: https://python.langchain.com/en/latest/modules/indexes/vectorstores.html
from langchain.vectorstores import FAISS
#facebook vectorization
from langchain.chains.question_answering import load_qa_chain
#load pdf
#vectorize db index with chromadb
from langchain.indexes import VectorstoreIndexCreator
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredPDFLoader

# Authenticate LangChain's HuggingFaceHub calls via the Streamlit secret
# 'hf_api_key' (read from .streamlit/secrets.toml or the deployment secrets).
os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets["hf_api_key"]


def init():
	"""Initialize the shared embeddings model, LLM and QA chain.

	Sets the module-level globals ``embeddings``, ``llm`` and ``chain``
	that pdf_file() and text_file() rely on. Must be called once before
	either handler runs.
	"""
	# NOTE(review): 'llm2' was declared global here but never assigned or
	# used anywhere in the live code, so it has been dropped.
	global embeddings, llm, chain
	# Default HuggingFace sentence-embedding model.
	embeddings = HuggingFaceEmbeddings()
	# Hosted instruction-tuned model; temperature 0 for deterministic answers.
	llm = HuggingFaceHub(repo_id="declare-lab/flan-alpaca-large",
			     model_kwargs={"temperature": 0, "max_length": 512})
	# "stuff" chain: concatenates all supplied docs into a single prompt.
	chain = load_qa_chain(llm, chain_type="stuff")

def pdf_file(txtFileObj):
	"""Index an uploaded PDF and answer free-text questions about it.

	Persists the upload to the working directory, builds a vector index
	over it, and renders a query box whose answers come from a
	RetrievalQA chain over that index.

	Args:
		txtFileObj: Streamlit UploadedFile for a .pdf document.
	"""
	st.subheader('Uploaded PDF File:')
	st.write(txtFileObj.name)

	# Use only the base name: the uploaded filename is untrusted and could
	# otherwise contain path separators (path-traversal risk).
	safe_name = os.path.basename(txtFileObj.name)
	with open(safe_name, "wb") as f:
		f.write(txtFileObj.getbuffer())

	# Build a vector index over the PDF's chunks (no chunk overlap).
	loaders = [UnstructuredPDFLoader(safe_name)]
	index = VectorstoreIndexCreator(
		embedding=embeddings,
		text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)).from_loaders(loaders)

	# Local QA chain over the index (the original shadowed the module-level
	# 'chain'; renamed to make the scoping explicit).
	qa_chain = RetrievalQA.from_chain_type(llm=llm,
					       chain_type="stuff",
					       retriever=index.vectorstore.as_retriever(),
					       input_key="question")

	st.subheader('Enter query')
	query = st.text_input('Ask anything about the Document you uploaded')

	if query:
		answer = qa_chain.run(question=query)

		st.subheader('Answer')
		st.write(answer)

def text_file(txtFileObj):
	"""Index an uploaded text file and answer questions about it.

	Persists the upload, splits it into chunks, embeds them into a FAISS
	store, and answers queries by running similar chunks through the
	module-level QA chain built in init().

	Args:
		txtFileObj: Streamlit UploadedFile for a .txt document.
	"""
	st.subheader('Uploaded Text File:')
	st.write(txtFileObj.name)

	# Use only the base name: the uploaded filename is untrusted and could
	# otherwise contain path separators (path-traversal risk).
	safe_name = os.path.basename(txtFileObj.name)
	with open(safe_name, "wb") as f:
		f.write(txtFileObj.getbuffer())

	loader = TextLoader(safe_name)
	documents = loader.load()

	# ~1000-char chunks with a small overlap for context continuity.
	text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
	docs = text_splitter.split_documents(documents)

	# FAISS vector store over the chunks, using the shared embeddings model.
	db = FAISS.from_documents(docs, embeddings)

	st.subheader('Enter query')
	query = st.text_input('Ask anything about the Document you uploaded')

	if query:
		# Retrieve the most similar chunks, then stuff them into the QA chain.
		docs = db.similarity_search(query)
		answer = chain.run(input_documents=docs, question=query)

		st.subheader('Answer')
		st.write(answer)

st.title('Document Q&A - Ask anything in your Document')
st.subheader('This application can be used to upload text(.txt) and PDF(.pdf) files and ask questions about their contents.')

init()

st.sidebar.subheader('Upload document')
uploaded_file = st.sidebar.file_uploader("Upload File", type=['txt', 'pdf'])

# Dispatch on the upload's extension. The original tested
# `uploaded_file and suffix == ...` twice; a single guarded if/elif avoids
# the duplicate checks.
if uploaded_file:
	suffix = Path(uploaded_file.name).suffix
	if suffix == '.txt':
		st.sidebar.info(Path(uploaded_file.name))
		text_file(uploaded_file)
	elif suffix == '.pdf':
		pdf_file(uploaded_file)

# NOTE(review): this block originally mixed 4-space and tab indentation,
# which risks a TabError under Python 3; normalized to the file's tabs.
with st.sidebar.expander('File'):
	if uploaded_file:
		st.info(uploaded_file.name)
# Colab leftover: show the notebook's file listing when running there.
if os.path.exists('/content/'):
	st.info(os.listdir('/content/'))


# # PDFs
# # !wget https://github.com/fabiomatricardi/cdQnA/raw/main/PLC_mediumArticle.pdf
# # !wget https://github.com/fabiomatricardi/cdQnA/raw/main/BridgingTheGaap_fromMedium.pdf
# # !mkdir pdfs
# # !cp *pdf '/content/pdfs'

# # pdf_folder_path = '/content/pdfs'
# # os.listdir(pdf_folder_path)

# # loaders = [UnstructuredPDFLoader(os.path.join(pdf_folder_path, fn)) for fn in os.listdir(pdf_folder_path)]
# # loaders

# index = VectorstoreIndexCreator(
#     embedding=HuggingFaceEmbeddings(),
#     text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)).from_loaders(loaders)

# #Load llm with selected one
# llm2=HuggingFaceHub(repo_id="declare-lab/flan-alpaca-large", model_kwargs={"temperature":0, "max_length":512})
# #Prepare the pipeline
# from langchain.chains import RetrievalQA
# chain = RetrievalQA.from_chain_type(llm=llm2,
#                                     chain_type="stuff",
#                                     retriever=index.vectorstore.as_retriever(),
#                                     input_key="question")
# #get reply to our questions
# # chain.run('What is the difference between a PLC and a PC?')