Spaces:

Somnath3570
/

FAQ_document

Sleeping

App Files Files Community

Somnath3570 committed on Feb 18

Commit

eb2a41f

verified ·

1 Parent(s): 8fea7fb

Create app.py

Browse files

Files changed (1) hide show

app.py +202 -0

app.py ADDED Viewed

	@@ -0,0 +1,202 @@

+import os
+import streamlit as st
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain.chains import RetrievalQA
+from langchain_community.vectorstores import FAISS
+from langchain_core.prompts import PromptTemplate
+from langchain_huggingface import HuggingFaceEndpoint
+from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+# Load environment variables
+from dotenv import load_dotenv, find_dotenv
+load_dotenv(find_dotenv())
+# Constants
+# Directory that uploaded PDFs are written to and that PDFs are loaded from.
+DATA_PATH = "data/"
+# Location where the FAISS index is saved and later reloaded.
+DB_FAISS_PATH = "vectorstore/db_faiss"
+# Hugging Face repository id passed to the LLM endpoint in load_llm().
+HUGGINGFACE_REPO_ID = "mistralai/Mistral-7B-Instruct-v0.3"
+# API token read from the environment; None when HF_TOKEN is not set.
+HF_TOKEN = os.environ.get("HF_TOKEN")
+# Custom prompt template
+# The {context} and {question} placeholders are the input variables declared
+# by the PromptTemplate built in set_custom_prompt().
+CUSTOM_PROMPT_TEMPLATE = """
+Use the pieces of information provided in the context to answer user's question.
+If you dont know the answer, just say that you dont know, dont try to make up an answer.
+Dont provide anything out of the given context
+Context: {context}
+Question: {question}
+Start the answer directly. No small talk please.
+"""
+def load_pdf_files(data_path):
+    try:
+        loader = DirectoryLoader(data_path,
+                                glob='*.pdf',
+                                loader_cls=PyPDFLoader)
+        documents = loader.load()
+        return documents
+    except Exception as e:
+        st.error(f"Error loading PDF files: {e}")
+        return []
+def create_chunks(extracted_data):
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,
+                                                chunk_overlap=50)
+    text_chunks = text_splitter.split_documents(extracted_data)
+    return text_chunks
+def get_embedding_model():
+    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+    return embedding_model
+def create_vectorstore():
+    if not os.path.exists(DATA_PATH):
+        os.makedirs(DATA_PATH)
+        st.warning(f"Created empty data directory at {DATA_PATH}. Please upload PDF files.")
+        return None
+    documents = load_pdf_files(data=DATA_PATH)
+    if not documents:
+        st.warning("No PDF files found in data directory. Please upload some PDFs.")
+        return None
+    st.info(f"Loaded {len(documents)} PDF pages")
+    text_chunks = create_chunks(extracted_data=documents)
+    st.info(f"Created {len(text_chunks)} text chunks")
+    embedding_model = get_embedding_model()
+    if not os.path.exists(os.path.dirname(DB_FAISS_PATH)):
+        os.makedirs(os.path.dirname(DB_FAISS_PATH))
+    db = FAISS.from_documents(text_chunks, embedding_model)
+    db.save_local(DB_FAISS_PATH)
+    st.success(f"Created vector store at {DB_FAISS_PATH}")
+    return db
+@st.cache_resource
+def get_vectorstore():
+    if os.path.exists(DB_FAISS_PATH):
+        embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
+        try:
+            db = FAISS.load_local(DB_FAISS_PATH, embedding_model, allow_dangerous_deserialization=True)
+            return db
+        except Exception as e:
+            st.error(f"Error loading vector store: {e}")
+            return None
+    else:
+        st.warning("Vector store not found. Please create it first.")
+        return None
+def set_custom_prompt():
+    prompt = PromptTemplate(template=CUSTOM_PROMPT_TEMPLATE, input_variables=["context", "question"])
+    return prompt
+def load_llm():
+    if not HF_TOKEN:
+        st.error("HF_TOKEN not found. Please set it in your environment variables.")
+        return None
+    try:
+        llm = HuggingFaceEndpoint(
+            repo_id=HUGGINGFACE_REPO_ID,
+            task="text-generation",
+            temperature=0.5,
+            model_kwargs={
+                "token": HF_TOKEN,
+                "max_length": 512
+            }
+        )
+        return llm
+    except Exception as e:
+        st.error(f"Error loading LLM: {e}")
+        return None
+def upload_pdf():
+    uploaded_files = st.file_uploader("Upload PDF files", type="pdf", accept_multiple_files=True)
+    if uploaded_files:
+        for uploaded_file in uploaded_files:
+            with open(os.path.join(DATA_PATH, uploaded_file.name), "wb") as f:
+                f.write(uploaded_file.getbuffer())
+        st.success(f"Uploaded {len(uploaded_files)} files to {DATA_PATH}")
+        return True
+    return False
+def main():
+    st.title("PDF Question Answering System")
+    # Sidebar
+    st.sidebar.title("Settings")
+    page = st.sidebar.radio("Choose an action", ["Upload PDFs", "Create Vector Store", "Chat with Documents"])
+    if page == "Upload PDFs":
+        st.header("Upload PDF Files")
+        st.info("Upload PDF files that will be used for question answering")
+        if upload_pdf():
+            st.info("Now go to 'Create Vector Store' to process your documents")
+    elif page == "Create Vector Store":
+        st.header("Create Vector Store")
+        st.info("This will process your PDF files and create embeddings")
+        if st.button("Create Vector Store"):
+            with st.spinner("Processing documents..."):
+                create_vectorstore()
+    elif page == "Chat with Documents":
+        st.header("Ask Questions About Your Documents")
+        if 'messages' not in st.session_state:
+            st.session_state.messages = []
+        for message in st.session_state.messages:
+            st.chat_message(message['role']).markdown(message['content'])
+        prompt = st.chat_input("Ask a question about your documents")
+        if prompt:
+            st.chat_message('user').markdown(prompt)
+            st.session_state.messages.append({'role': 'user', 'content': prompt})
+            vectorstore = get_vectorstore()
+            if vectorstore is None:
+                st.error("Vector store not available. Please create it first.")
+                return
+            llm = load_llm()
+            if llm is None:
+                return
+            try:
+                with st.spinner("Thinking..."):
+                    qa_chain = RetrievalQA.from_chain_type(
+                        llm=llm,
+                        chain_type="stuff",
+                        retriever=vectorstore.as_retriever(search_kwargs={'k': 3}),
+                        return_source_documents=True,
+                        chain_type_kwargs={'prompt': set_custom_prompt()}
+                    )
+                    response = qa_chain.invoke({'query': prompt})
+                    result = response["result"]
+                    source_documents = response["source_documents"]
+                    # Format source documents more cleanly
+                    source_docs_text = "\n\n**Source Documents:**\n"
+                    for i, doc in enumerate(source_documents, 1):
+                        source_docs_text += f"{i}. Page {doc.metadata.get('page', 'N/A')}: {doc.page_content[:200]}...\n\n"
+                    result_to_show = f"{result}\n{source_docs_text}"
+                    st.chat_message('assistant').markdown(result_to_show)
+                    st.session_state.messages.append({'role': 'assistant', 'content': result_to_show})
+            except Exception as e:
+                st.error(f"Error: {str(e)}")
+                st.error("Please check your HuggingFace token and model access permissions")
+# Run the app only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()