File size: 4,415 Bytes
eaf0e00
 
952eb35
5aee298
eaf0e00
 
 
 
 
 
 
 
 
 
 
 
 
8c5d334
 
 
eaf0e00
 
 
 
952eb35
8c5d334
 
 
 
 
 
952eb35
8c5d334
 
 
952eb35
8c5d334
 
952eb35
8c5d334
 
 
 
 
 
 
 
 
952eb35
8c5d334
 
952eb35
8c5d334
 
952eb35
8c5d334
 
952eb35
8c5d334
 
 
952eb35
8c5d334
 
 
 
 
 
5aee298
8c5d334
 
 
5aee298
8c5d334
5aee298
 
 
 
8c5d334
 
 
5aee298
8c5d334
 
eaf0e00
8c5d334
 
eaf0e00
8c5d334
eaf0e00
8c5d334
 
eaf0e00
8c5d334
 
 
eaf0e00
8c5d334
 
eaf0e00
8c5d334
 
 
 
 
eaf0e00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import os
import streamlit as st
from pathlib import Path
from io import StringIO

#for textfiles
from langchain.document_loaders import TextLoader
#text splitter
from langchain.text_splitter import CharacterTextSplitter
#for using HugginFace models & embeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain import HuggingFaceHub
# Vectorstore: https://python.langchain.com/en/latest/modules/indexes/vectorstores.html
from langchain.vectorstores import FAISS
#facebook vectorization
from langchain.chains.question_answering import load_qa_chain
#load pdf
#vectorize db index with chromadb
from langchain.indexes import VectorstoreIndexCreator
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredPDFLoader

# Authenticate LangChain's HuggingFaceHub calls via the Streamlit secret
# 'hf_api_key' (read from .streamlit/secrets.toml or the deployment secrets).
os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets["hf_api_key"]


def init():
	"""Initialize the shared embeddings model, LLM and QA chain.

	Sets the module-level globals ``embeddings``, ``llm`` and ``chain``
	that pdf_file() and text_file() rely on. Must be called once before
	either handler runs.
	"""
	# NOTE(review): 'llm2' was declared global here but never assigned or
	# used anywhere in the live code, so it has been dropped.
	global embeddings, llm, chain
	# Default HuggingFace sentence-embedding model.
	embeddings = HuggingFaceEmbeddings()
	# Hosted instruction-tuned model; temperature 0 for deterministic answers.
	llm = HuggingFaceHub(repo_id="declare-lab/flan-alpaca-large",
			     model_kwargs={"temperature": 0, "max_length": 512})
	# "stuff" chain: concatenates all supplied docs into a single prompt.
	chain = load_qa_chain(llm, chain_type="stuff")

def pdf_file(txtFileObj):
	"""Index an uploaded PDF and answer free-text questions about it.

	Persists the upload to the working directory, builds a vector index
	over it, and renders a query box whose answers come from a
	RetrievalQA chain over that index.

	Args:
		txtFileObj: Streamlit UploadedFile for a .pdf document.
	"""
	st.subheader('Uploaded PDF File:')
	st.write(txtFileObj.name)

	# Use only the base name: the uploaded filename is untrusted and could
	# otherwise contain path separators (path-traversal risk).
	safe_name = os.path.basename(txtFileObj.name)
	with open(safe_name, "wb") as f:
		f.write(txtFileObj.getbuffer())

	# Build a vector index over the PDF's chunks (no chunk overlap).
	loaders = [UnstructuredPDFLoader(safe_name)]
	index = VectorstoreIndexCreator(
		embedding=embeddings,
		text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)).from_loaders(loaders)

	# Local QA chain over the index (the original shadowed the module-level
	# 'chain'; renamed to make the scoping explicit).
	qa_chain = RetrievalQA.from_chain_type(llm=llm,
					       chain_type="stuff",
					       retriever=index.vectorstore.as_retriever(),
					       input_key="question")

	st.subheader('Enter query')
	query = st.text_input('Ask anything about the Document you uploaded')

	if query:
		answer = qa_chain.run(question=query)

		st.subheader('Answer')
		st.write(answer)

def text_file(txtFileObj):
	"""Index an uploaded text file and answer questions about it.

	Persists the upload, splits it into chunks, embeds them into a FAISS
	store, and answers queries by running similar chunks through the
	module-level QA chain built in init().

	Args:
		txtFileObj: Streamlit UploadedFile for a .txt document.
	"""
	st.subheader('Uploaded Text File:')
	st.write(txtFileObj.name)

	# Use only the base name: the uploaded filename is untrusted and could
	# otherwise contain path separators (path-traversal risk).
	safe_name = os.path.basename(txtFileObj.name)
	with open(safe_name, "wb") as f:
		f.write(txtFileObj.getbuffer())

	loader = TextLoader(safe_name)
	documents = loader.load()

	# ~1000-char chunks with a small overlap for context continuity.
	text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
	docs = text_splitter.split_documents(documents)

	# FAISS vector store over the chunks, using the shared embeddings model.
	db = FAISS.from_documents(docs, embeddings)

	st.subheader('Enter query')
	query = st.text_input('Ask anything about the Document you uploaded')

	if query:
		# Retrieve the most similar chunks, then stuff them into the QA chain.
		docs = db.similarity_search(query)
		answer = chain.run(input_documents=docs, question=query)

		st.subheader('Answer')
		st.write(answer)

st.title('Document Q&A - Ask anything in your Document')
st.subheader('This application can be used to upload text(.txt) and PDF(.pdf) files and ask questions about their contents.')

init()

st.sidebar.subheader('Upload document')
uploaded_file = st.sidebar.file_uploader("Upload File", type=['txt', 'pdf'])

# Dispatch on the upload's extension. The original tested
# `uploaded_file and suffix == ...` twice; a single guarded if/elif avoids
# the duplicate checks.
if uploaded_file:
	suffix = Path(uploaded_file.name).suffix
	if suffix == '.txt':
		st.sidebar.info(Path(uploaded_file.name))
		text_file(uploaded_file)
	elif suffix == '.pdf':
		pdf_file(uploaded_file)

# NOTE(review): this block originally mixed 4-space and tab indentation,
# which risks a TabError under Python 3; normalized to the file's tabs.
with st.sidebar.expander('File'):
	if uploaded_file:
		st.info(uploaded_file.name)
# Colab leftover: show the notebook's file listing when running there.
if os.path.exists('/content/'):
	st.info(os.listdir('/content/'))


# # PDFs
# # !wget https://github.com/fabiomatricardi/cdQnA/raw/main/PLC_mediumArticle.pdf
# # !wget https://github.com/fabiomatricardi/cdQnA/raw/main/BridgingTheGaap_fromMedium.pdf
# # !mkdir pdfs
# # !cp *pdf '/content/pdfs'

# # pdf_folder_path = '/content/pdfs'
# # os.listdir(pdf_folder_path)

# # loaders = [UnstructuredPDFLoader(os.path.join(pdf_folder_path, fn)) for fn in os.listdir(pdf_folder_path)]
# # loaders

# index = VectorstoreIndexCreator(
#     embedding=HuggingFaceEmbeddings(),
#     text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)).from_loaders(loaders)

# #Load llm with selected one
# llm2=HuggingFaceHub(repo_id="declare-lab/flan-alpaca-large", model_kwargs={"temperature":0, "max_length":512})
# #Prepare the pipeline
# from langchain.chains import RetrievalQA
# chain = RetrievalQA.from_chain_type(llm=llm2,
#                                     chain_type="stuff",
#                                     retriever=index.vectorstore.as_retriever(),
#                                     input_key="question")
# #get reply to our questions
# # chain.run('What is the difference between a PLC and a PC?')