Spaces:
Sleeping
Sleeping
File size: 4,415 Bytes
eaf0e00 952eb35 5aee298 eaf0e00 8c5d334 eaf0e00 952eb35 8c5d334 952eb35 8c5d334 952eb35 8c5d334 952eb35 8c5d334 952eb35 8c5d334 952eb35 8c5d334 952eb35 8c5d334 952eb35 8c5d334 952eb35 8c5d334 5aee298 8c5d334 5aee298 8c5d334 5aee298 8c5d334 5aee298 8c5d334 eaf0e00 8c5d334 eaf0e00 8c5d334 eaf0e00 8c5d334 eaf0e00 8c5d334 eaf0e00 8c5d334 eaf0e00 8c5d334 eaf0e00 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
import os
import streamlit as st
from pathlib import Path
from io import StringIO
#for textfiles
from langchain.document_loaders import TextLoader
#text splitter
from langchain.text_splitter import CharacterTextSplitter
#for using HugginFace models & embeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain import HuggingFaceHub
# Vectorstore: https://python.langchain.com/en/latest/modules/indexes/vectorstores.html
from langchain.vectorstores import FAISS
#facebook vectorization
from langchain.chains.question_answering import load_qa_chain
#load pdf
#vectorize db index with chromadb
from langchain.indexes import VectorstoreIndexCreator
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredPDFLoader
# Authenticate against the Hugging Face Hub: the API key is pulled from
# Streamlit secrets (secrets.toml / Spaces secrets) and exposed via the
# environment variable that langchain's HuggingFaceHub wrapper reads.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets["hf_api_key"]
def init():
    """Initialise the shared NLP components used by both file handlers.

    Side effects: binds the module-level globals ``embeddings``, ``llm``
    and ``chain``. Must be called once before ``text_file``/``pdf_file``.
    """
    # NOTE: the original also declared ``llm2`` global but never assigned
    # it anywhere in this function — dead name removed.
    global embeddings, llm, chain
    # Default HuggingFace sentence-transformer embeddings.
    embeddings = HuggingFaceEmbeddings()
    # Hosted instruction-tuned model; temperature 0 for deterministic answers.
    llm = HuggingFaceHub(
        repo_id="declare-lab/flan-alpaca-large",
        model_kwargs={"temperature": 0, "max_length": 512},
    )
    # "stuff" chain: concatenates all supplied documents into one prompt.
    chain = load_qa_chain(llm, chain_type="stuff")
def pdf_file(txtFileObj):
    """Handle an uploaded PDF: persist it, build a retrieval index over it,
    and answer a free-text query entered by the user."""
    st.subheader('Uploaded PDF File:')
    st.write(txtFileObj.name)
    # Persist the upload to disk so UnstructuredPDFLoader can read it by path.
    with open(txtFileObj.name, "wb") as out:
        out.write(txtFileObj.getbuffer())
    pdf_loader = UnstructuredPDFLoader(txtFileObj.name)
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    # Build the vector index from the single loader (uses global ``embeddings``).
    index = VectorstoreIndexCreator(
        embedding=embeddings,
        text_splitter=splitter,
    ).from_loaders([pdf_loader])
    # Local retrieval-QA chain over the freshly built index (uses global ``llm``).
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=index.vectorstore.as_retriever(),
        input_key="question",
    )
    st.subheader('Enter query')
    query = st.text_input('Ask anything about the Document you uploaded')
    if query:
        answer = qa.run(question=query)
        st.subheader('Answer')
        st.write(answer)
def text_file(txtFileObj):
    """Handle an uploaded .txt file: save it, embed it into a FAISS store,
    and answer a free-text query entered by the user."""
    st.subheader('Uploaded Text File:')
    st.write(txtFileObj.name)
    # Save the upload locally so TextLoader can open it by filename.
    with open(txtFileObj.name, "wb") as out:
        out.write(txtFileObj.getbuffer())
    documents = TextLoader(txtFileObj.name).load()
    # Split into ~1000-character chunks with a small (10-char) overlap.
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
    chunks = splitter.split_documents(documents)
    # In-memory FAISS vector store over the chunks (uses global ``embeddings``).
    db = FAISS.from_documents(chunks, embeddings)
    st.subheader('Enter query')
    query = st.text_input('Ask anything about the Document you uploaded')
    if query:
        matches = db.similarity_search(query)
        # ``chain`` is the global "stuff" QA chain created in init().
        answer = chain.run(input_documents=matches, question=query)
        st.subheader('Answer')
        st.write(answer)
# --- Page layout and upload dispatch ---
st.title('Document Q&A - Ask anything in your Document')
st.subheader('This application can be used to upload text(.txt) and PDF(.pdf) files and ask questions about their contents.')
init()

st.sidebar.subheader('Upload document')
uploaded_file = st.sidebar.file_uploader("Upload File", type=['txt', 'pdf'])

# Dispatch on the file extension of the uploaded file, if any.
if uploaded_file:
    suffix = Path(uploaded_file.name).suffix
    if suffix == '.txt':
        st.sidebar.info(Path(uploaded_file.name))
        text_file(uploaded_file)
    elif suffix == '.pdf':
        pdf_file(uploaded_file)

with st.sidebar.expander('File'):
    if uploaded_file:
        st.info(uploaded_file.name)
        # Colab-only debug listing; skipped when /content/ does not exist.
        if os.path.exists('/content/'):
            st.info(os.listdir('/content/'))
# # PDFs
# # !wget https://github.com/fabiomatricardi/cdQnA/raw/main/PLC_mediumArticle.pdf
# # !wget https://github.com/fabiomatricardi/cdQnA/raw/main/BridgingTheGaap_fromMedium.pdf
# # !mkdir pdfs
# # !cp *pdf '/content/pdfs'
# # pdf_folder_path = '/content/pdfs'
# # os.listdir(pdf_folder_path)
# # loaders = [UnstructuredPDFLoader(os.path.join(pdf_folder_path, fn)) for fn in os.listdir(pdf_folder_path)]
# # loaders
# index = VectorstoreIndexCreator(
# embedding=HuggingFaceEmbeddings(),
# text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)).from_loaders(loaders)
# #Load llm with selected one
# llm2=HuggingFaceHub(repo_id="declare-lab/flan-alpaca-large", model_kwargs={"temperature":0, "max_length":512})
# #Prepare the pipeline
# from langchain.chains import RetrievalQA
# chain = RetrievalQA.from_chain_type(llm=llm2,
# chain_type="stuff",
# retriever=index.vectorstore.as_retriever(),
# input_key="question")
# #get reply to our questions
# # chain.run('What is the difference between a PLC and a PC?') |