# Streamlit "Document Q&A" app (Hugging Face Space).
"""Document Q&A - ask questions about an uploaded document.

Streamlit UI backed by LangChain: documents are chunked, embedded with
HuggingFace embeddings, indexed in a FAISS vector store, and answered by a
HuggingFace Hub LLM. NOTE(review): the retrieval/QA pipeline below is still
commented out (tutorial scaffolding) — the UI currently collects the upload
and the query but never computes an answer; wire `uploaded_file` and `query`
into the chain before shipping.
"""
import os

import streamlit as st

# Plain-text document loader
from langchain.document_loaders import TextLoader
# Splits long documents into overlapping chunks for embedding
from langchain.text_splitter import CharacterTextSplitter
# HuggingFace models & embeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain import HuggingFaceHub
# Vectorstore: https://python.langchain.com/en/latest/modules/indexes/vectorstores.html
from langchain.vectorstores import FAISS
# QA chain that "stuffs" retrieved chunks into the prompt
from langchain.chains.question_answering import load_qa_chain
# PDF loader
from langchain.document_loaders import UnstructuredPDFLoader

# HuggingFace Hub token is read from Streamlit secrets — never hard-code it.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets["hf_api_key"]

st.title('Document Q&A - Ask anything in your Document')
st.sidebar.subheader('Upload document')
# NOTE(review): the subheader is in the sidebar but the uploader renders in
# the main area — confirm whether st.sidebar.file_uploader was intended.
uploaded_file = st.file_uploader("Upload File", type=['txt', 'pdf'])
# url2 = "https://github.com/fabiomatricardi/cdQnA/raw/main/KS-all-info_rev1.txt"
# res = requests.get(url2)
# with open("KS-all-info_rev1.txt", "w") as f:
#     f.write(res.text)

st.subheader('Enter query')
query = st.text_input('Ask anything about the Document you uploaded')
st.subheader('Answer')
# NOTE(review): placeholder — `query` is never passed to a chain yet.
st.write('Answer from document')

# ---------------------------------------------------------------------------
# Commented-out tutorial scaffolding for the retrieval pipeline, kept for
# reference until the app is wired up.
# ---------------------------------------------------------------------------
# # Document Loader
# loader = TextLoader('./KS-all-info_rev1.txt')
# documents = loader.load()
# import textwrap
# def wrap_text_preserve_newlines(text, width=110):
#     # Split the input text into lines based on newline characters
#     lines = text.split('\n')
#     # Wrap each line individually
#     wrapped_lines = [textwrap.fill(line, width=width) for line in lines]
#     # Join the wrapped lines back together using newline characters
#     wrapped_text = '\n'.join(wrapped_lines)
#     return wrapped_text
# # Text Splitter
# text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
# docs = text_splitter.split_documents(documents)
# # Embeddings
# embeddings = HuggingFaceEmbeddings()
# # Create the vectorized db
# db = FAISS.from_documents(docs, embeddings)
# llm = HuggingFaceHub(repo_id="google/flan-t5-xl", model_kwargs={"temperature": 0, "max_length": 512})
# llm2 = HuggingFaceHub(repo_id="declare-lab/flan-alpaca-large", model_kwargs={"temperature": 0, "max_length": 512})
# chain = load_qa_chain(llm2, chain_type="stuff")
# # Sample question
# # query = "What the actual issues and drawbacks ?"
# # docs = db.similarity_search(query)
# # chain.run(input_documents=docs, question=query)
# # PDFs
# # !wget https://github.com/fabiomatricardi/cdQnA/raw/main/PLC_mediumArticle.pdf
# # !wget https://github.com/fabiomatricardi/cdQnA/raw/main/BridgingTheGaap_fromMedium.pdf
# # !mkdir pdfs
# # !cp *pdf '/content/pdfs'
# # pdf_folder_path = '/content/pdfs'
# # os.listdir(pdf_folder_path)
# # loaders = [UnstructuredPDFLoader(os.path.join(pdf_folder_path, fn)) for fn in os.listdir(pdf_folder_path)]
# # loaders
# # NOTE(review): VectorstoreIndexCreator is used below but never imported —
# # add `from langchain.indexes import VectorstoreIndexCreator` when enabling.
# index = VectorstoreIndexCreator(
#     embedding=HuggingFaceEmbeddings(),
#     text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)).from_loaders(loaders)
# # Load llm with selected one
# llm2 = HuggingFaceHub(repo_id="declare-lab/flan-alpaca-large", model_kwargs={"temperature": 0, "max_length": 512})
# # Prepare the pipeline
# from langchain.chains import RetrievalQA
# chain = RetrievalQA.from_chain_type(llm=llm2,
#                                     chain_type="stuff",
#                                     retriever=index.vectorstore.as_retriever(),
#                                     input_key="question")
# # get reply to our questions
# # chain.run('What is the difference between a PLC and a PC?')