import os

import streamlit as st
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from langchain_huggingface import HuggingFaceEndpoint
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load environment variables
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

# Constants
DATA_PATH = "data/"
DB_FAISS_PATH = "vectorstore/db_faiss"
HUGGINGFACE_REPO_ID = "mistralai/Mistral-7B-Instruct-v0.3"
HF_TOKEN = os.environ.get("HF_TOKEN")

# Custom prompt template
CUSTOM_PROMPT_TEMPLATE = """ | |
Use the pieces of information provided in the context to answer user's question. | |
If you dont know the answer, just say that you dont know, dont try to make up an answer. | |
Dont provide anything out of the given context | |
Context: {context} | |
Question: {question} | |
Start the answer directly. No small talk please. | |
""" | |
def load_pdf_files(data_path):
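    """Load every PDF found in data_path.

    DirectoryLoader with PyPDFLoader yields one Document per page,
    so callers receive a page-level list, not a file-level one.
    """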
    try:
        loader = DirectoryLoader(data_path,
                                 glob='*.pdf',
                                 loader_cls=PyPDFLoader)
        documents = loader.load()
        return documents
    except Exception as e:
        st.error(f"Error loading PDF files: {e}")
        return []

def create_chunks(extracted_data):
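    """Split page-level documents into ~500-character chunks with 50-character overlap."""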
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,
                                                   chunk_overlap=50)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

def get_embedding_model():
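    """Return the embedding model used for indexing.

    The same model must be used when loading the index back in
    get_vectorstore(), or the query vectors won't match the index.
    """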
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embedding_model

def create_vectorstore():
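    """Build a FAISS index from the PDFs in DATA_PATH and save it to DB_FAISS_PATH."""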
    if not os.path.exists(DATA_PATH):
        os.makedirs(DATA_PATH)
        st.warning(f"Created empty data directory at {DATA_PATH}. Please upload PDF files.")
        return None

    # Keyword must match the function's parameter name (data_path)
    documents = load_pdf_files(data_path=DATA_PATH)
    if not documents:
        st.warning("No PDF files found in data directory. Please upload some PDFs.")
        return None
    st.info(f"Loaded {len(documents)} PDF pages")

    text_chunks = create_chunks(extracted_data=documents)
    st.info(f"Created {len(text_chunks)} text chunks")

    embedding_model = get_embedding_model()
    os.makedirs(os.path.dirname(DB_FAISS_PATH), exist_ok=True)

    db = FAISS.from_documents(text_chunks, embedding_model)
    db.save_local(DB_FAISS_PATH)
    st.success(f"Created vector store at {DB_FAISS_PATH}")
    return db

def get_vectorstore():
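    """Load the saved FAISS index with the same embedding model it was built with."""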
    if os.path.exists(DB_FAISS_PATH):
        embedding_model = get_embedding_model()
        try:
            # allow_dangerous_deserialization is required because FAISS
            # persists via pickle; only load indexes you created yourself.
            db = FAISS.load_local(DB_FAISS_PATH, embedding_model, allow_dangerous_deserialization=True)
            return db
        except Exception as e:
            st.error(f"Error loading vector store: {e}")
            return None
    else:
        st.warning("Vector store not found. Please create it first.")
        return None

def set_custom_prompt():
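    """Wrap CUSTOM_PROMPT_TEMPLATE in a PromptTemplate with context/question slots."""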
    prompt = PromptTemplate(template=CUSTOM_PROMPT_TEMPLATE, input_variables=["context", "question"])
    return prompt

def load_llm():
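    """Create the Mistral-7B-Instruct endpoint client; requires a valid HF_TOKEN."""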
    if not HF_TOKEN:
        st.error("HF_TOKEN not found. Please set it in your environment variables.")
        return None
    try:
        # Pass the token and generation limit as direct parameters;
        # recent langchain_huggingface releases reject them inside
        # model_kwargs, and max_new_tokens (not max_length) caps output.
        llm = HuggingFaceEndpoint(
            repo_id=HUGGINGFACE_REPO_ID,
            task="text-generation",
            temperature=0.5,
            max_new_tokens=512,
            huggingfacehub_api_token=HF_TOKEN
        )
        return llm
    except Exception as e:
        st.error(f"Error loading LLM: {e}")
        return None

def upload_pdf():
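    """Streamlit uploader that saves the chosen PDFs into DATA_PATH."""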
    uploaded_files = st.file_uploader("Upload PDF files", type="pdf", accept_multiple_files=True)
    if uploaded_files:
        # Ensure the target directory exists before writing; on a fresh
        # deployment DATA_PATH may not have been created yet.
        os.makedirs(DATA_PATH, exist_ok=True)
        for uploaded_file in uploaded_files:
            with open(os.path.join(DATA_PATH, uploaded_file.name), "wb") as f:
                f.write(uploaded_file.getbuffer())
        st.success(f"Uploaded {len(uploaded_files)} files to {DATA_PATH}")
        return True
    return False

def main():
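    """Route between the three pages: upload, index creation, and chat."""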
    st.title("PDF Question Answering System")

    # Sidebar
    st.sidebar.title("Settings")
    page = st.sidebar.radio("Choose an action", ["Upload PDFs", "Create Vector Store", "Chat with Documents"])

    if page == "Upload PDFs":
        st.header("Upload PDF Files")
        st.info("Upload PDF files that will be used for question answering")
        if upload_pdf():
            st.info("Now go to 'Create Vector Store' to process your documents")

    elif page == "Create Vector Store":
        st.header("Create Vector Store")
        st.info("This will process your PDF files and create embeddings")
        if st.button("Create Vector Store"):
            with st.spinner("Processing documents..."):
                create_vectorstore()

    elif page == "Chat with Documents":
        st.header("Ask Questions About Your Documents")

        if 'messages' not in st.session_state:
            st.session_state.messages = []

        for message in st.session_state.messages:
            st.chat_message(message['role']).markdown(message['content'])

        prompt = st.chat_input("Ask a question about your documents")
        if prompt:
            st.chat_message('user').markdown(prompt)
            st.session_state.messages.append({'role': 'user', 'content': prompt})
            vectorstore = get_vectorstore()
            if vectorstore is None:
                st.error("Vector store not available. Please create it first.")
                return

            llm = load_llm()
            if llm is None:
                return
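            # RetrievalQA with chain_type="stuff" retrieves the top-k chunks
            # and inserts them verbatim into the {context} slot of the custom
            # prompt; k=3 keeps the stuffed context within the model's input
            # budget.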
            try:
                with st.spinner("Thinking..."):
                    qa_chain = RetrievalQA.from_chain_type(
                        llm=llm,
                        chain_type="stuff",
                        retriever=vectorstore.as_retriever(search_kwargs={'k': 3}),
                        return_source_documents=True,
                        chain_type_kwargs={'prompt': set_custom_prompt()}
                    )
                    response = qa_chain.invoke({'query': prompt})
                    result = response["result"]
                    source_documents = response["source_documents"]

                    # Format source documents more cleanly
                    source_docs_text = "\n\n**Source Documents:**\n"
                    for i, doc in enumerate(source_documents, 1):
                        source_docs_text += f"{i}. Page {doc.metadata.get('page', 'N/A')}: {doc.page_content[:200]}...\n\n"

                    result_to_show = f"{result}\n{source_docs_text}"
                    st.chat_message('assistant').markdown(result_to_show)
                    st.session_state.messages.append({'role': 'assistant', 'content': result_to_show})
            except Exception as e:
                st.error(f"Error: {str(e)}")
                st.error("Please check your HuggingFace token and model access permissions")


if __name__ == "__main__":
    main()