Spaces:

sango07
/

Chat_with_multiple_PDF

Sleeping

App Files Files Community

sango07 committed on Dec 17, 2024

Commit

7541c6e

verified ·

1 Parent(s): 8c7b46a

Update app.py

Browse files

Files changed (1) hide show

app.py +61 -163

app.py CHANGED Viewed

@@ -1,176 +1,74 @@
-import streamlit as st
-from dotenv import load_dotenv
-import os
-from langchain.embeddings import HuggingFaceEmbeddings
-from htmlTemplate import css, bot_template, user_template
-import PyPDF2
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.embeddings.spacy_embeddings import SpacyEmbeddings
-from langchain_community.llms import LlamaCpp
-from langchain.embeddings import HuggingFaceEmbeddings
-from langchain.vectorstores import FAISS
-from langchain.memory import ConversationBufferMemory
-from langchain.chains import ConversationalRetrievalChain
-from langchain.prompts import PromptTemplate
-from sentence_transformers import SentenceTransformer, util
-#from langchain_openai import AzureOpenAIEmbeddings
-#from langchain_openai import OpenAIEmbeddings
-from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
-#from langchain_openai import ChatOpenAI
-os.environ["OPENAI_API_KEY"] = "sk-.............."
-import os
-os.environ["GROQ_API_KEY"]=os.getenv('GROQ_API_KEY')
-from langchain_groq import ChatGroq
-llmtemplate = """You’re an AI information specialist with a strong emphasis on extracting accurate information from markdown documents. Your expertise involves summarizing data succinctly while adhering to strict guidelines about neutrality and clarity.
-Your task is to answer a specific question based on a provided markdown document. Here is the question you need to address:
-{question}
-Keep in mind the following instructions:
-- Your response should be direct and factual, limited to 50 words and 2-3 sentences.
-- Avoid using introductory phrases like "yes" or "no."
-- Maintain an ethical and unbiased tone, steering clear of harmful or offensive content.
-- If the document lacks relevant information, respond with "I cannot provide an answer based on the provided document."
-- Do not fabricate information, include questions, or use confirmatory phrases.
-- Remember not to prompt for additional information or ask any questions.
-Ensure your response is strictly based on the content of the markdown document.
-    """
-def prepare_docs(pdf_docs):
-    docs = []
-    metadata = []
-    content = []
-    for pdf in pdf_docs:
-        print(pdf.name)
-        pdf_reader = PyPDF2.PdfReader(pdf)
-        for index, text in enumerate(pdf_reader.pages):
-            doc_page = {'title': pdf.name + " page " + str(index + 1),
-                        'content': pdf_reader.pages[index].extract_text()}
-            docs.append(doc_page)
-    for doc in docs:
-        content.append(doc["content"])
-        metadata.append({
-            "title": doc["title"]
-        })
-    return content, metadata
-def get_text_chunks(content, metadata):
-    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
-        chunk_size=1024,
-        chunk_overlap=256,
-    )
-    split_docs = text_splitter.create_documents(content, metadatas=metadata)
-    print(f"Split documents into {len(split_docs)} passages")
-    return split_docs
-def ingest_into_vectordb(split_docs):
-    # embeddings = OpenAIEmbeddings()
-    # embeddings = FastEmbedEmbeddings()
-    # embeddings = SpacyEmbeddings(model_name="en_core_web_sm")
-    embeddings=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2",
-                                     model_kwargs={'device':'cpu'})
-    db = FAISS.from_documents(split_docs, embeddings)
-    DB_FAISS_PATH = 'vectorstore/db_faiss'
-    db.save_local(DB_FAISS_PATH)
-    return db
-def get_conversation_chain(vectordb):
-    # llama_llm = ChatOpenAI(temperature=0.7, model="gpt-3.5-turbo")
-    llm = ChatGroq(model="llama3-70b-8192", temperature=0.25)
-    retriever = vectordb.as_retriever()
-    CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(llmtemplate)
-    memory = ConversationBufferMemory(
-        memory_key='chat_history', return_messages=True, output_key='answer')
-    conversation_chain = (ConversationalRetrievalChain.from_llm
-                          (llm=llm,
-                           retriever=retriever,
-                           #condense_question_prompt=CONDENSE_QUESTION_PROMPT,
-                           memory=memory,
-                           return_source_documents=True))
-    print("Conversational Chain created for the LLM using the vector store")
-    return conversation_chain
-def validate_answer_against_sources(response_answer, source_documents):
-    model = SentenceTransformer('all-MiniLM-L6-v2')
-    similarity_threshold = 0.5
-    source_texts = [doc.page_content for doc in source_documents]
-    answer_embedding = model.encode(response_answer, convert_to_tensor=True)
-    source_embeddings = model.encode(source_texts, convert_to_tensor=True)
-    cosine_scores = util.pytorch_cos_sim(answer_embedding, source_embeddings)
-    if any(score.item() > similarity_threshold for score in cosine_scores[0]):
-        return True
-    return False
-def handle_userinput(user_question):
-    response = st.session_state.conversation({'question': user_question})
-    st.session_state.chat_history = response['chat_history']
-    for i, message in enumerate(st.session_state.chat_history):
-        print(i)
-        if i % 2 == 0:
-            st.write(user_template.replace(
-                "{{MSG}}", message.content), unsafe_allow_html=True)
-        else:
-            print(message.content)
-            st.write(bot_template.replace(
-                "{{MSG}}", message.content), unsafe_allow_html=True)
 def main():
     load_dotenv()
-    st.set_page_config(page_title="Chat with your PDFs",
-                       page_icon=":books:")
     st.write(css, unsafe_allow_html=True)
     if "conversation" not in st.session_state:
         st.session_state.conversation = None
     if "chat_history" not in st.session_state:
         st.session_state.chat_history = []
-    st.header("Chat with multiple PDFs :books:")
-    user_question = st.text_input("Ask a question about your documents:")
-    if user_question:
-        handle_userinput(user_question)
     with st.sidebar:
-        st.subheader("Your documents")
         pdf_docs = st.file_uploader(
-            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
-        if st.button("Process"):
-            with st.spinner("Processing"):
-                # get pdf text
-                content, metadata = prepare_docs(pdf_docs)
-                # get the text chunks
-                split_docs = get_text_chunks(content, metadata)
-                # create vector store
-                vectorstore = ingest_into_vectordb(split_docs)
-                # create conversation chain
-                st.session_state.conversation = get_conversation_chain(
-                    vectorstore)
-if __name__ == '__main__':
-    main()

 def main():
+    """Run the PDF Insights AI Streamlit page.
+
+    Configures the page, renders the welcome text, initialises the
+    ``conversation`` and ``chat_history`` session-state slots, collects PDF
+    uploads in the sidebar, builds the conversation chain when the user
+    clicks "Process Documents", and forwards a typed question to
+    ``handle_userinput`` once a chain exists.
+    """
     load_dotenv()
+    st.set_page_config(
+        page_title="PDF Insights AI",
+        page_icon=":books:",
+        layout="wide"
+    )
     st.write(css, unsafe_allow_html=True)
+    # Welcome section
+    st.title("📚 PDF Insights AI")
+    st.markdown("""
+    ### Unlock the Knowledge in Your PDFs
+    - 🤖 AI-powered document analysis
+    - 💬 Ask questions about your uploaded documents
+    - 📄 Support for multiple PDF files
+    """)
     if "conversation" not in st.session_state:
         st.session_state.conversation = None
     if "chat_history" not in st.session_state:
         st.session_state.chat_history = []
+    # File upload section
     with st.sidebar:
+        st.header("📤 Upload Documents")
         pdf_docs = st.file_uploader(
+            "Upload your PDFs here",
+            type=['pdf'],
+            accept_multiple_files=True,
+            help="Upload PDF files to analyze. Max file size: 200MB"
+        )
+        # File validation
+        if pdf_docs:
+            max_upload_bytes = 200 * 1024 * 1024  # 200 MB
+            accepted_docs = []
+            for doc in pdf_docs:
+                if doc.size > max_upload_bytes:
+                    st.error(f"File {doc.name} is too large. Maximum file size is 200MB.")
+                else:
+                    accepted_docs.append(doc)
+            # Rebind to a freshly built list rather than calling
+            # pdf_docs.remove() inside the loop: removing from a list while
+            # iterating over it skips the element that follows each removal,
+            # so a second oversized file could slip through unchecked.
+            pdf_docs = accepted_docs
+        if st.button("Process Documents", type="primary"):
+            if not pdf_docs:
+                st.warning("Please upload at least one PDF file.")
+            else:
+                with st.spinner("Processing your documents..."):
+                    # Broad catch is deliberate: this is the UI boundary, and
+                    # any failure in the pipeline is reported to the user
+                    # instead of crashing the page.
+                    try:
+                        # get pdf text
+                        content, metadata = prepare_docs(pdf_docs)
+                        # get the text chunks
+                        split_docs = get_text_chunks(content, metadata)
+                        # create vector store
+                        vectorstore = ingest_into_vectordb(split_docs)
+                        # create conversation chain
+                        st.session_state.conversation = get_conversation_chain(vectorstore)
+                        st.success("Documents processed successfully! You can now ask questions.")
+                    except Exception as e:
+                        st.error(f"An error occurred while processing documents: {str(e)}")
+    # Question input section
+    user_question = st.text_input(
+        "📝 Ask a question about your documents",
+        placeholder="What insights can you provide from these documents?"
+    )
+    if user_question:
+        # A chain only exists after a successful "Process Documents" run.
+        if st.session_state.conversation is None:
+            st.warning("Please upload and process documents first.")
+        else:
+            handle_userinput(user_question)