import os

import streamlit as st
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from langchain_huggingface import HuggingFaceEndpoint
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load environment variables
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())

# Constants
DATA_PATH = "data/"
DB_FAISS_PATH = "vectorstore/db_faiss"
HUGGINGFACE_REPO_ID = "mistralai/Mistral-7B-Instruct-v0.3"
HF_TOKEN = os.environ.get("HF_TOKEN")

# Custom prompt template
CUSTOM_PROMPT_TEMPLATE = """
Use the pieces of information provided in the context to answer the user's question.
If you don't know the answer, just say that you don't know; don't try to make up an answer.
Don't provide anything outside the given context.

Context: {context}
Question: {question}

Start the answer directly. No small talk, please.
"""


def load_pdf_files(data_path):
    """Load every PDF in data_path; PyPDFLoader yields one Document per page."""
    try:
        loader = DirectoryLoader(data_path, glob='*.pdf', loader_cls=PyPDFLoader)
        documents = loader.load()
        return documents
    except Exception as e:
        st.error(f"Error loading PDF files: {e}")
        return []


def create_chunks(extracted_data):
    """Split page-level documents into overlapping chunks for retrieval."""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks


def get_embedding_model():
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embedding_model


def create_vectorstore():
    if not os.path.exists(DATA_PATH):
        os.makedirs(DATA_PATH)
        st.warning(f"Created empty data directory at {DATA_PATH}. Please upload PDF files.")
        return None

    documents = load_pdf_files(data_path=DATA_PATH)
    if not documents:
        st.warning("No PDF files found in data directory. Please upload some PDFs.")
        return None

    st.info(f"Loaded {len(documents)} PDF pages")

    text_chunks = create_chunks(extracted_data=documents)
    st.info(f"Created {len(text_chunks)} text chunks")

    embedding_model = get_embedding_model()

    if not os.path.exists(os.path.dirname(DB_FAISS_PATH)):
        os.makedirs(os.path.dirname(DB_FAISS_PATH))

    db = FAISS.from_documents(text_chunks, embedding_model)
    db.save_local(DB_FAISS_PATH)
    st.success(f"Created vector store at {DB_FAISS_PATH}")

    # Drop any cached (possibly None) vector store so the chat page reloads the new index
    get_vectorstore.clear()
    return db


@st.cache_resource
def get_vectorstore():
    if os.path.exists(DB_FAISS_PATH):
        embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
        try:
            db = FAISS.load_local(DB_FAISS_PATH, embedding_model, allow_dangerous_deserialization=True)
            return db
        except Exception as e:
            st.error(f"Error loading vector store: {e}")
            return None
    else:
        st.warning("Vector store not found. Please create it first.")
        return None


def set_custom_prompt():
    prompt = PromptTemplate(template=CUSTOM_PROMPT_TEMPLATE, input_variables=["context", "question"])
    return prompt


def load_llm():
    if not HF_TOKEN:
        st.error("HF_TOKEN not found. Please set it in your environment variables.")
        return None
    try:
        # The token and generation length are passed as top-level arguments;
        # putting them in model_kwargs raises validation errors in recent
        # langchain-huggingface releases.
        llm = HuggingFaceEndpoint(
            repo_id=HUGGINGFACE_REPO_ID,
            task="text-generation",
            temperature=0.5,
            max_new_tokens=512,
            huggingfacehub_api_token=HF_TOKEN,
        )
        return llm
    except Exception as e:
        st.error(f"Error loading LLM: {e}")
        return None


def upload_pdf():
    uploaded_files = st.file_uploader("Upload PDF files", type="pdf", accept_multiple_files=True)
    if uploaded_files:
        os.makedirs(DATA_PATH, exist_ok=True)  # ensure the target directory exists
        for uploaded_file in uploaded_files:
            with open(os.path.join(DATA_PATH, uploaded_file.name), "wb") as f:
                f.write(uploaded_file.getbuffer())
        st.success(f"Uploaded {len(uploaded_files)} files to {DATA_PATH}")
        return True
    return False


def main():
    st.title("PDF Question Answering System")

    # Sidebar
    st.sidebar.title("Settings")
    page = st.sidebar.radio("Choose an action", ["Upload PDFs", "Create Vector Store", "Chat with Documents"])

    if page == "Upload PDFs":
        st.header("Upload PDF Files")
        st.info("Upload PDF files that will be used for question answering")
        if upload_pdf():
            st.info("Now go to 'Create Vector Store' to process your documents")

    elif page == "Create Vector Store":
        st.header("Create Vector Store")
        st.info("This will process your PDF files and create embeddings")
        if st.button("Create Vector Store"):
            with st.spinner("Processing documents..."):
                create_vectorstore()

    elif page == "Chat with Documents":
        st.header("Ask Questions About Your Documents")

        if 'messages' not in st.session_state:
            st.session_state.messages = []

        for message in st.session_state.messages:
            st.chat_message(message['role']).markdown(message['content'])

        prompt = st.chat_input("Ask a question about your documents")

        if prompt:
            st.chat_message('user').markdown(prompt)
            st.session_state.messages.append({'role': 'user', 'content': prompt})

            vectorstore = get_vectorstore()
            if vectorstore is None:
                st.error("Vector store not available. Please create it first.")
                return

            llm = load_llm()
            if llm is None:
                return

            try:
                with st.spinner("Thinking..."):
                    qa_chain = RetrievalQA.from_chain_type(
                        llm=llm,
                        chain_type="stuff",
                        retriever=vectorstore.as_retriever(search_kwargs={'k': 3}),
                        return_source_documents=True,
                        chain_type_kwargs={'prompt': set_custom_prompt()}
                    )

                    response = qa_chain.invoke({'query': prompt})
                    result = response["result"]
                    source_documents = response["source_documents"]

                    # Format source documents more cleanly
                    source_docs_text = "\n\n**Source Documents:**\n"
                    for i, doc in enumerate(source_documents, 1):
                        source_docs_text += f"{i}. Page {doc.metadata.get('page', 'N/A')}: {doc.page_content[:200]}...\n\n"

                    result_to_show = f"{result}\n{source_docs_text}"
                    st.chat_message('assistant').markdown(result_to_show)
                    st.session_state.messages.append({'role': 'assistant', 'content': result_to_show})

            except Exception as e:
                st.error(f"Error: {str(e)}")
                st.error("Please check your HuggingFace token and model access permissions")


if __name__ == "__main__":
    main()
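
# Running the app locally -- a sketch, assuming this module is saved as
# app.py (the actual filename is not given in the source) and that the
# dependencies implied by the imports above are installed:
#
#   pip install streamlit langchain langchain-community langchain-huggingface \
#       faiss-cpu pypdf sentence-transformers python-dotenv
#   streamlit run app.py
#
# HF_TOKEN can be exported in the shell or placed in a .env file in the
# project root, which load_dotenv(find_dotenv()) will pick up, e.g.:
#
#   HF_TOKEN=<your HuggingFace access token>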