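"""Streamlit RAG app: upload PDFs, build a FAISS index over their text, and answer
questions against it with a HuggingFace-hosted Mistral model.

Run with: streamlit run <this_file>.py  (requires HF_TOKEN in the environment or a .env file)
"""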
import os
import streamlit as st
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
from langchain.chains import RetrievalQA
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load environment variables
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

# Constants
DATA_PATH = "data/"
DB_FAISS_PATH = "vectorstore/db_faiss"
HUGGINGFACE_REPO_ID = "mistralai/Mistral-7B-Instruct-v0.3"
HF_TOKEN = os.environ.get("HF_TOKEN")

# Custom prompt template
CUSTOM_PROMPT_TEMPLATE = """
Use the pieces of information provided in the context to answer the user's question.
If you don't know the answer, just say that you don't know; don't try to make up an answer.

Don't provide anything outside of the given context.

Context: {context}
Question: {question}

Start the answer directly. No small talk please.
"""

def load_pdf_files(data_path):
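    """Load every PDF in data_path and return its pages as LangChain documents."""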
    try:
        loader = DirectoryLoader(data_path,
                                glob='*.pdf',
                                loader_cls=PyPDFLoader)
        documents = loader.load()
        return documents
    except Exception as e:
        st.error(f"Error loading PDF files: {e}")
        return []

def create_chunks(extracted_data):
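    """Split documents into overlapping chunks sized for embedding and retrieval."""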
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,
                                                chunk_overlap=50)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

def get_embedding_model():
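    """Return the sentence-transformers model used for both indexing and querying."""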
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embedding_model

def create_vectorstore():
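    """Build a FAISS vector store from the PDFs in DATA_PATH and persist it to disk."""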
    if not os.path.exists(DATA_PATH):
        os.makedirs(DATA_PATH)
        st.warning(f"Created empty data directory at {DATA_PATH}. Please upload PDF files.")
        return None
    
    documents = load_pdf_files(data_path=DATA_PATH)
    if not documents:
        st.warning("No PDF files found in data directory. Please upload some PDFs.")
        return None
    
    st.info(f"Loaded {len(documents)} PDF pages")
    text_chunks = create_chunks(extracted_data=documents)
    st.info(f"Created {len(text_chunks)} text chunks")
    
    embedding_model = get_embedding_model()
    
    os.makedirs(os.path.dirname(DB_FAISS_PATH), exist_ok=True)
    
    db = FAISS.from_documents(text_chunks, embedding_model)
    db.save_local(DB_FAISS_PATH)
    # Clear the cached copy so a stale (possibly None) store is not served after rebuilding
    get_vectorstore.clear()
    st.success(f"Created vector store at {DB_FAISS_PATH}")
    return db

@st.cache_resource
def get_vectorstore():
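    """Load the persisted FAISS vector store, caching it across Streamlit reruns."""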
    if os.path.exists(DB_FAISS_PATH):
        embedding_model = get_embedding_model()
        try:
            db = FAISS.load_local(DB_FAISS_PATH, embedding_model, allow_dangerous_deserialization=True)
            return db
        except Exception as e:
            st.error(f"Error loading vector store: {e}")
            return None
    else:
        st.warning("Vector store not found. Please create it first.")
        return None

def set_custom_prompt():
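    """Wrap CUSTOM_PROMPT_TEMPLATE in a PromptTemplate taking context and question."""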
    prompt = PromptTemplate(template=CUSTOM_PROMPT_TEMPLATE, input_variables=["context", "question"])
    return prompt

def load_llm():
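    """Create the HuggingFace Inference endpoint client for the Mistral model."""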
    if not HF_TOKEN:
        st.error("HF_TOKEN not found. Please set it in your environment variables.")
        return None
    
    try:
        # The token and generation length are first-class HuggingFaceEndpoint parameters;
        # passing them through model_kwargs raises a validation error in langchain_huggingface.
        llm = HuggingFaceEndpoint(
            repo_id=HUGGINGFACE_REPO_ID,
            task="text-generation",
            temperature=0.5,
            max_new_tokens=512,
            huggingfacehub_api_token=HF_TOKEN,
        )
        return llm
    except Exception as e:
        st.error(f"Error loading LLM: {e}")
        return None

def upload_pdf():
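    """Render a file uploader and save any uploaded PDFs into DATA_PATH."""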
    uploaded_files = st.file_uploader("Upload PDF files", type="pdf", accept_multiple_files=True)
    if uploaded_files:
        os.makedirs(DATA_PATH, exist_ok=True)  # The data directory may not exist on first run
        for uploaded_file in uploaded_files:
            with open(os.path.join(DATA_PATH, uploaded_file.name), "wb") as f:
                f.write(uploaded_file.getbuffer())
        st.success(f"Uploaded {len(uploaded_files)} files to {DATA_PATH}")
        return True
    return False

def main():
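    """Streamlit entry point: route between the upload, indexing, and chat pages."""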
    st.title("PDF Question Answering System")
    
    # Sidebar
    st.sidebar.title("Settings")
    page = st.sidebar.radio("Choose an action", ["Upload PDFs", "Create Vector Store", "Chat with Documents"])
    
    if page == "Upload PDFs":
        st.header("Upload PDF Files")
        st.info("Upload PDF files that will be used for question answering")
        if upload_pdf():
            st.info("Now go to 'Create Vector Store' to process your documents")
    
    elif page == "Create Vector Store":
        st.header("Create Vector Store")
        st.info("This will process your PDF files and create embeddings")
        if st.button("Create Vector Store"):
            with st.spinner("Processing documents..."):
                create_vectorstore()
    
    elif page == "Chat with Documents":
        st.header("Ask Questions About Your Documents")
        
        if 'messages' not in st.session_state:
            st.session_state.messages = []
        
        for message in st.session_state.messages:
            st.chat_message(message['role']).markdown(message['content'])
        
        prompt = st.chat_input("Ask a question about your documents")
        
        if prompt:
            st.chat_message('user').markdown(prompt)
            st.session_state.messages.append({'role': 'user', 'content': prompt})
            
            vectorstore = get_vectorstore()
            if vectorstore is None:
                st.error("Vector store not available. Please create it first.")
                return
            
            llm = load_llm()
            if llm is None:
                return
            
            try:
                with st.spinner("Thinking..."):
                    qa_chain = RetrievalQA.from_chain_type(
                        llm=llm,
                        chain_type="stuff",
                        retriever=vectorstore.as_retriever(search_kwargs={'k': 3}),
                        return_source_documents=True,
                        chain_type_kwargs={'prompt': set_custom_prompt()}
                    )
                    
                    response = qa_chain.invoke({'query': prompt})
                    
                    result = response["result"]
                    source_documents = response["source_documents"]
                    
                    # Format source documents more cleanly
                    source_docs_text = "\n\n**Source Documents:**\n"
                    for i, doc in enumerate(source_documents, 1):
                        source_docs_text += f"{i}. Page {doc.metadata.get('page', 'N/A')}: {doc.page_content[:200]}...\n\n"
                    
                    result_to_show = f"{result}\n{source_docs_text}"
                    
                    st.chat_message('assistant').markdown(result_to_show)
                    st.session_state.messages.append({'role': 'assistant', 'content': result_to_show})
                    
            except Exception as e:
                st.error(f"Error: {str(e)}")
                st.error("Please check your HuggingFace token and model access permissions")

if __name__ == "__main__":
    main()