Spaces:

Somnath3570
/

FAQ_document

Sleeping

App Files Community

Somnath3570 committed on Feb 18

Commit

12f6335

verified ·

1 Parent(s): 53c6af5

Update app.py

Browse files

Files changed (1) hide show

app.py +110 -149

app.py CHANGED Viewed

@@ -8,195 +8,156 @@ from langchain_huggingface import HuggingFaceEndpoint
 from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-# Load environment variables
-from dotenv import load_dotenv, find_dotenv
-load_dotenv(find_dotenv())
-# Constants
 DATA_PATH = "data/"
 DB_FAISS_PATH = "vectorstore/db_faiss"
-HUGGINGFACE_REPO_ID = "mistralai/Mistral-7B-Instruct-v0.3"
-HF_TOKEN = os.environ.get("HF_TOKEN")
-# Custom prompt template
-CUSTOM_PROMPT_TEMPLATE = """
-Use the pieces of information provided in the context to answer user's question.
-If you dont know the answer, just say that you dont know, dont try to make up an answer.
-Dont provide anything out of the given context
-Context: {context}
-Question: {question}
-Start the answer directly. No small talk please.
-"""
 def load_pdf_files(data_path):
-    try:
-        loader = DirectoryLoader(data_path,
-                                glob='*.pdf',
-                                loader_cls=PyPDFLoader)
-        documents = loader.load()
-        return documents
-    except Exception as e:
-        st.error(f"Error loading PDF files: {e}")
-        return []
 def create_chunks(extracted_data):
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,
                                                 chunk_overlap=50)
     text_chunks = text_splitter.split_documents(extracted_data)
     return text_chunks
 def get_embedding_model():
     embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
     return embedding_model
-def create_vectorstore():
-    if not os.path.exists(DATA_PATH):
-        os.makedirs(DATA_PATH)
-        st.warning(f"Created empty data directory at {DATA_PATH}. Please upload PDF files.")
-        return None
-    documents = load_pdf_files(data=DATA_PATH)
-    if not documents:
-        st.warning("No PDF files found in data directory. Please upload some PDFs.")
-        return None
-    st.info(f"Loaded {len(documents)} PDF pages")
     text_chunks = create_chunks(extracted_data=documents)
     st.info(f"Created {len(text_chunks)} text chunks")
     embedding_model = get_embedding_model()
-    if not os.path.exists(os.path.dirname(DB_FAISS_PATH)):
-        os.makedirs(os.path.dirname(DB_FAISS_PATH))
     db = FAISS.from_documents(text_chunks, embedding_model)
     db.save_local(DB_FAISS_PATH)
-    st.success(f"Created vector store at {DB_FAISS_PATH}")
     return db
-@st.cache_resource
 def get_vectorstore():
     if os.path.exists(DB_FAISS_PATH):
-        embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
         try:
             db = FAISS.load_local(DB_FAISS_PATH, embedding_model, allow_dangerous_deserialization=True)
             return db
         except Exception as e:
             st.error(f"Error loading vector store: {e}")
-            return None
     else:
-        st.warning("Vector store not found. Please create it first.")
-        return None
-def set_custom_prompt():
-    prompt = PromptTemplate(template=CUSTOM_PROMPT_TEMPLATE, input_variables=["context", "question"])
-    return prompt
-def load_llm():
-    if not HF_TOKEN:
-        st.error("HF_TOKEN not found. Please set it in your environment variables.")
-        return None
-    try:
-        llm = HuggingFaceEndpoint(
-            repo_id=HUGGINGFACE_REPO_ID,
-            task="text-generation",
-            temperature=0.5,
-            model_kwargs={
-                "token": HF_TOKEN,
-                "max_length": 512
-            }
-        )
-        return llm
-    except Exception as e:
-        st.error(f"Error loading LLM: {e}")
-        return None
-def upload_pdf():
-    uploaded_files = st.file_uploader("Upload PDF files", type="pdf", accept_multiple_files=True)
-    if uploaded_files:
-        for uploaded_file in uploaded_files:
-            with open(os.path.join(DATA_PATH, uploaded_file.name), "wb") as f:
-                f.write(uploaded_file.getbuffer())
-        st.success(f"Uploaded {len(uploaded_files)} files to {DATA_PATH}")
-        return True
-    return False
 def main():
-    st.title("PDF Question Answering System")
-    # Sidebar
-    st.sidebar.title("Settings")
-    page = st.sidebar.radio("Choose an action", ["Upload PDFs", "Create Vector Store", "Chat with Documents"])
-    if page == "Upload PDFs":
-        st.header("Upload PDF Files")
-        st.info("Upload PDF files that will be used for question answering")
-        if upload_pdf():
-            st.info("Now go to 'Create Vector Store' to process your documents")
-    elif page == "Create Vector Store":
-        st.header("Create Vector Store")
-        st.info("This will process your PDF files and create embeddings")
-        if st.button("Create Vector Store"):
-            with st.spinner("Processing documents..."):
-                create_vectorstore()
-    elif page == "Chat with Documents":
-        st.header("Ask Questions About Your Documents")
-        if 'messages' not in st.session_state:
-            st.session_state.messages = []
-        for message in st.session_state.messages:
-            st.chat_message(message['role']).markdown(message['content'])
-        prompt = st.chat_input("Ask a question about your documents")
-        if prompt:
-            st.chat_message('user').markdown(prompt)
-            st.session_state.messages.append({'role': 'user', 'content': prompt})
-            vectorstore = get_vectorstore()
-            if vectorstore is None:
-                st.error("Vector store not available. Please create it first.")
-                return
-            llm = load_llm()
-            if llm is None:
-                return
-            try:
-                with st.spinner("Thinking..."):
-                    qa_chain = RetrievalQA.from_chain_type(
-                        llm=llm,
-                        chain_type="stuff",
-                        retriever=vectorstore.as_retriever(search_kwargs={'k': 3}),
-                        return_source_documents=True,
-                        chain_type_kwargs={'prompt': set_custom_prompt()}
-                    )
-                    response = qa_chain.invoke({'query': prompt})
-                    result = response["result"]
-                    source_documents = response["source_documents"]
-                    # Format source documents more cleanly
-                    source_docs_text = "\n\n**Source Documents:**\n"
-                    for i, doc in enumerate(source_documents, 1):
-                        source_docs_text += f"{i}. Page {doc.metadata.get('page', 'N/A')}: {doc.page_content[:200]}...\n\n"
-                    result_to_show = f"{result}\n{source_docs_text}"
-                    st.chat_message('assistant').markdown(result_to_show)
-                    st.session_state.messages.append({'role': 'assistant', 'content': result_to_show})
-            except Exception as e:
-                st.error(f"Error: {str(e)}")
-                st.error("Please check your HuggingFace token and model access permissions")
 if __name__ == "__main__":
     main()

 from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+# Use environment variable for Hugging Face token
+HF_TOKEN = os.environ.get("HF_TOKEN")
+HUGGINGFACE_REPO_ID = "mistralai/Mistral-7B-Instruct-v0.3"
 DATA_PATH = "data/"
 DB_FAISS_PATH = "vectorstore/db_faiss"
 def load_pdf_files(data_path):
+    """Load PDF files from the specified directory"""
+    loader = DirectoryLoader(data_path,
+                           glob='*.pdf',
+                           loader_cls=PyPDFLoader)
+    documents = loader.load()
+    return documents
 def create_chunks(extracted_data):
+    """Split documents into chunks"""
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,
                                                 chunk_overlap=50)
     text_chunks = text_splitter.split_documents(extracted_data)
     return text_chunks
 def get_embedding_model():
+    """Get the embedding model"""
     embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
     return embedding_model
+def create_embeddings():
+    """Create embeddings and save to FAISS database"""
+    # Step 1: Load PDFs
+    documents = load_pdf_files(data_path=DATA_PATH)
+    st.info(f"Loaded {len(documents)} documents")
+    # Step 2: Create chunks
     text_chunks = create_chunks(extracted_data=documents)
     st.info(f"Created {len(text_chunks)} text chunks")
+    # Step 3: Get embedding model
     embedding_model = get_embedding_model()
+    # Step 4: Create and save embeddings
+    os.makedirs(os.path.dirname(DB_FAISS_PATH), exist_ok=True)
     db = FAISS.from_documents(text_chunks, embedding_model)
     db.save_local(DB_FAISS_PATH)
+    st.success("Embeddings created and saved successfully!")
     return db
+def set_custom_prompt(custom_prompt_template):
+    """Set custom prompt template"""
+    prompt = PromptTemplate(template=custom_prompt_template, input_variables=["context", "question"])
+    return prompt
+def load_llm(huggingface_repo_id):
+    """Load Hugging Face LLM"""
+    llm = HuggingFaceEndpoint(
+        repo_id=huggingface_repo_id,
+        task="text-generation",
+        temperature=0.5,
+        model_kwargs={
+            "token": HF_TOKEN,
+            "max_length": 512
+        }
+    )
+    return llm
 def get_vectorstore():
+    """Get or create vector store"""
     if os.path.exists(DB_FAISS_PATH):
+        st.info("Loading existing vector store...")
+        embedding_model = get_embedding_model()
         try:
             db = FAISS.load_local(DB_FAISS_PATH, embedding_model, allow_dangerous_deserialization=True)
             return db
         except Exception as e:
             st.error(f"Error loading vector store: {e}")
+            st.info("Creating new vector store...")
+            return create_embeddings()
     else:
+        st.info("Creating new vector store...")
+        return create_embeddings()
 def main():
+    st.title("BeepKart FAQ Chatbot")
+    st.markdown("Ask questions about buying or selling bikes on BeepKart!")
+    # Initialize session state for messages
+    if 'messages' not in st.session_state:
+        st.session_state.messages = []
+    # Display chat history
+    for message in st.session_state.messages:
+        st.chat_message(message['role']).markdown(message['content'])
+    # Get user input
+    prompt = st.chat_input("Ask a question about BeepKart...")
+    # Custom prompt template
+    CUSTOM_PROMPT_TEMPLATE = """
+    Use the pieces of information provided in the context to answer user's question.
+    If you don't know the answer, just say that you don't know, don't try to make up an answer.
+    Don't provide anything out of the given context
+    Context: {context}
+    Question: {question}
+    Start the answer directly. No small talk please.
+    """
+    if prompt:
+        # Display user message
+        st.chat_message('user').markdown(prompt)
+        st.session_state.messages.append({'role': 'user', 'content': prompt})
+        try:
+            with st.spinner("Thinking..."):
+                # Get vector store
+                vectorstore = get_vectorstore()
+                # Create QA chain
+                qa_chain = RetrievalQA.from_chain_type(
+                    llm=load_llm(huggingface_repo_id=HUGGINGFACE_REPO_ID),
+                    chain_type="stuff",
+                    retriever=vectorstore.as_retriever(search_kwargs={'k': 3}),
+                    return_source_documents=True,
+                    chain_type_kwargs={'prompt': set_custom_prompt(CUSTOM_PROMPT_TEMPLATE)}
+                )
+                # Get response
+                response = qa_chain.invoke({'query': prompt})
+                # Extract result and sources
+                result = response["result"]
+                source_documents = response["source_documents"]
+                # Format source documents
+                source_docs_text = "\n\n**Sources:**\n"
+                for i, doc in enumerate(source_documents, 1):
+                    source_docs_text += f"{i}. Page {doc.metadata.get('page', 'N/A')}: {doc.page_content[:100]}...\n\n"
+                # Display result and sources
+                result_to_show = f"{result}\n{source_docs_text}"
+                st.chat_message('assistant').markdown(result_to_show)
+                st.session_state.messages.append({'role': 'assistant', 'content': result_to_show})
+        except Exception as e:
+            error_message = f"Error: {str(e)}"
+            st.error(error_message)
+            st.error("Please check your HuggingFace token and model access permissions")
+            st.session_state.messages.append({'role': 'assistant', 'content': error_message})
 if __name__ == "__main__":
     main()