Spaces:

mercybabs
/

chat_with_pdf

Runtime error

App Files Files Community

mercybabs committed on Mar 22

Commit

44cbe88

1 Parent(s): 04ad69c

Initial commit - added my RAG Streamlit app

Browse files

Files changed (3) hide show

Dockerfile +23 -0
app.py +286 -0
requirements.txt +152 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,23 @@

+# Use an official Python image
+FROM python:3.12
+# Set working directory inside the container
+WORKDIR /app
+# Install system dependencies first so this layer stays cached when only
+# application code changes. apt-get is used instead of apt because apt's
+# command-line interface is not intended for scripted use.
+RUN apt-get update && apt-get install -y \
+    tesseract-ocr \
+    poppler-utils \
+    && rm -rf /var/lib/apt/lists/*
+# Install Python dependencies before copying the source tree, so editing
+# app.py does not force a full reinstall of requirements.txt
+COPY requirements.txt /app/requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy the application files into the container
+COPY . /app
+# Expose the port Streamlit runs on
+EXPOSE 8501
+# Set the default command to run the app
+CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]

app.py ADDED Viewed

	@@ -0,0 +1,286 @@

+#Import Library
+from unstructured.partition.pdf import partition_pdf
+from langchain_openai import ChatOpenAI
+from langchain_core.messages import SystemMessage, HumanMessage
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+from langchain.schema.runnable import RunnablePassthrough,RunnableLambda
+from langchain_postgres.vectorstores import PGVector
+from database import COLLECTION_NAME, CONNECTION_STRING
+from langchain_community.storage import RedisStore
+from langchain.schema.document import Document
+from langchain_openai import OpenAIEmbeddings
+from langchain.retrievers.multi_vector import MultiVectorRetriever
+from pathlib import Path
+from IPython.display import display, HTML
+from base64 import b64decode
+import os, hashlib, shutil, uuid, json, time
+import torch, redis, streamlit as st
+import logging
+import openai
+# from dotenv import load_dotenv
+# load_dotenv()
+# Read the OpenAI key from the environment; this is None when OPENAI_API_KEY is unset.
+openai_api_key = os.getenv("OPENAI_API_KEY")
+# Replace torch.classes.__path__ with an explicit single-entry list.
+# NOTE(review): presumably a workaround for Streamlit's module watcher failing
+# while inspecting torch.classes -- confirm it is still needed with the pinned versions.
+torch.classes.__path__ = [os.path.join(torch.__path__[0], torch.classes.__file__)]
+# Configure logging at INFO level
+logging.basicConfig(level=logging.INFO)
+# Initialize the module-wide Redis client (hard-coded to localhost:6379, db 0).
+# It is used both as the document store backend and for the "pdf:<hash>" markers.
+client = redis.Redis(host="localhost", port=6379, db=0)
+#Data Loading
+def load_pdf_data(file_path):
+    logging.info(f"Data ready to be partitioned and loaded ")
+    raw_pdf_elements = partition_pdf(
+        filename=file_path,
+        infer_table_structure=True,
+        strategy = "hi_res",
+        extract_image_block_types = ["Image"],
+        extract_image_block_to_payload  = True,
+        chunking_strategy="by_title",
+        mode='elements',
+        max_characters=10000,
+        new_after_n_chars=5000,
+        combine_text_under_n_chars=2000,
+        image_output_dir_path="data/",
+    )
+    logging.info(f"Pdf data finish loading, chunks now available!")
+    return raw_pdf_elements
+# Generate a unique hash for a PDF file
+def get_pdf_hash(pdf_path):
+    """Generate a SHA-256 hash of the PDF file content."""
+    with open(pdf_path, "rb") as f:
+        pdf_bytes = f.read()
+    return hashlib.sha256(pdf_bytes).hexdigest()
+# Summarize extracted text and tables using LLM
+def summarize_text_and_tables(text, tables):
+    logging.info("Ready to summarize data with LLM")
+    prompt_text = """You are an assistant tasked with summarizing text and tables. \
+                    You are to give a concise summary of the table or text and do nothing else.
+                    Table or text chunk: {element} """
+    prompt = ChatPromptTemplate.from_template(prompt_text)
+    model = ChatOpenAI(temperature=0.6, model="gpt-4o-mini", openai_api_key=openai_api_key)
+    summarize_chain = {"element": RunnablePassthrough()}| prompt | model | StrOutputParser()
+    logging.info(f"{model} done with summarization")
+    return {
+        "text": summarize_chain.batch(text, {"max_concurrency": 5}),
+        "table": summarize_chain.batch(tables, {"max_concurrency": 5})
+    }
+#Initialize a pgvector and retriever for storing and searching documents
+def initialize_retriever():
+    store = RedisStore(client=client)
+    id_key = "doc_id"
+    vectorstore = PGVector(
+            embeddings=OpenAIEmbeddings(),
+            collection_name=COLLECTION_NAME,
+            connection=CONNECTION_STRING,
+            use_jsonb=True,
+            )
+    retrieval_loader = MultiVectorRetriever(vectorstore=vectorstore, docstore=store, id_key="doc_id")
+    return retrieval_loader
+# Store text, tables, and their summaries in the retriever
+def store_docs_in_retriever(text, text_summary, table, table_summary, retriever):
+    """Store text and table documents along with their summaries in the retriever."""
+    def add_documents_to_retriever(documents, summaries, retriever, id_key = "doc_id"):
+        """Helper function to add documents and their summaries to the retriever."""
+        if not summaries:
+            return None, []
+        doc_ids = [str(uuid.uuid4()) for _ in documents]
+        summary_docs = [
+            Document(page_content=summary, metadata={id_key: doc_ids[i]})
+            for i, summary in enumerate(summaries)
+        ]
+        retriever.vectorstore.add_documents(summary_docs, ids=doc_ids)
+        retriever.docstore.mset(list(zip(doc_ids, documents)))
+# Add text, table, and image summaries to the retriever
+    add_documents_to_retriever(text, text_summary, retriever)
+    add_documents_to_retriever(table, table_summary, retriever)
+    return retriever
+# Parse the retriever output
+def parse_retriver_output(data):
+    parsed_elements = []
+    for element in data:
+        # Decode bytes to string if necessary
+        if isinstance(element, bytes):
+            element = element.decode("utf-8")
+        parsed_elements.append(element)
+    return parsed_elements
+# Chat with the LLM using retrieved context
+def chat_with_llm(retriever):
+    logging.info(f"Context ready to send to LLM ")
+    prompt_text = """
+                You are an AI Assistant tasked with understanding detailed
+                information from text and tables. You are to answer the question based on the
+                context provided to you. You must not go beyond the context given to you.
+                Context:
+                {context}
+                Question:
+                {question}
+                """
+    prompt = ChatPromptTemplate.from_template(prompt_text)
+    model = ChatOpenAI(temperature=0.6, model="gpt-4o-mini", openai_api_key=openai_api_key)
+    rag_chain = ({
+       "context": retriever | RunnableLambda(parse_retriver_output), "question": RunnablePassthrough(),
+        }
+        | prompt
+        | model
+        | StrOutputParser()
+        )
+    logging.info(f"Completed! ")
+    return rag_chain
+# Generate temporary file path of uploaded docs
+def _get_file_path(file_upload):
+    temp_dir = "temp"
+    os.makedirs(temp_dir, exist_ok=True)  # Ensure the directory exists
+    if isinstance(file_upload, str):
+        file_path = file_upload  # Already a string path
+    else:
+        file_path = os.path.join(temp_dir, file_upload.name)
+        with open(file_path, "wb") as f:
+            f.write(file_upload.getbuffer())
+        return file_path
+# Process uploaded PDF file
+def process_pdf(file_upload):
+    print('Processing PDF hash info...')
+    file_path =  _get_file_path(file_upload)
+    pdf_hash = get_pdf_hash(file_path)
+    load_retriever = initialize_retriever()
+    existing = client.exists(f"pdf:{pdf_hash}")
+    print(f"Checking Redis for hash {pdf_hash}: {'Exists' if existing else 'Not found'}")
+    if existing:
+        print(f"PDF already exists with hash {pdf_hash}. Skipping upload.")
+        return load_retriever
+    print(f"New PDF detected. Processing... {pdf_hash}")
+    pdf_elements = load_pdf_data(file_path)
+    tables = [element.metadata.text_as_html for element in
+               pdf_elements if 'Table' in str(type(element))]
+    text = [element.text for element in pdf_elements if
+            'CompositeElement' in str(type(element))]
+    summaries = summarize_text_and_tables(text, tables)
+    retriever = store_docs_in_retriever(text, summaries['text'], tables,  summaries['table'], load_retriever)
+    # Store the PDF hash in Redis
+    client.set(f"pdf:{pdf_hash}", json.dumps({"text": "PDF processed"}))
+    # Debug: Check if Redis stored the key
+    stored = client.exists(f"pdf:{pdf_hash}")
+    # #remove temp directory
+    # shutil.rmtree("dir")
+    print(f"Stored PDF hash in Redis: {'Success' if stored else 'Failed'}")
+    return retriever
+#Invoke chat with LLM based on uploaded PDF and user query
+def invoke_chat(file_upload, message):
+    retriever =process_pdf(file_upload)
+    rag_chain = chat_with_llm(retriever)
+    response = rag_chain.invoke(message)
+    response_placeholder = st.empty()
+    response_placeholder.write(response)
+    return response
+# Main application interface using Streamlit
+def main():
+    st.title("PDF Chat Assistant ")
+    logging.info("App started")
+    if 'messages' not in st.session_state:
+        st.session_state.messages = []
+    file_upload = st.sidebar.file_uploader(
+    label="Upload", type=["pdf"],
+    accept_multiple_files=False,
+    key="pdf_uploader"
+    )
+    if file_upload:
+        st.success("File uploaded successfully! You can now ask your question.")
+    # Prompt for user input
+    if prompt := st.chat_input("Your question"):
+        st.session_state.messages.append({"role": "user", "content": prompt})
+    # Display chat history
+    for message in st.session_state.messages:
+        with st.chat_message(message["role"]):
+            st.write(message["content"])
+    # Generate response if last message is not from assistant
+    if st.session_state.messages and st.session_state.messages[-1]["role"] != "assistant":
+        with st.chat_message("assistant"):
+            start_time = time.time()
+            logging.info("Generating response...")
+            with st.spinner("Processing..."):
+                user_message = " ".join([msg["content"] for msg in st.session_state.messages if msg])
+                response_message = invoke_chat(file_upload, user_message)
+                duration = time.time() - start_time
+                response_msg_with_duration = f"{response_message}\n\nDuration: {duration:.2f} seconds"
+                st.session_state.messages.append({"role": "assistant", "content": response_msg_with_duration})
+                st.write(f"Duration: {duration:.2f} seconds")
+                logging.info(f"Response: {response_message}, Duration: {duration:.2f} s")
+if __name__ == "__main__":
+    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,152 @@

+aiofiles==24.1.0
+aiohappyeyeballs==2.6.1
+aiohttp==3.11.14
+aiosignal==1.3.2
+altair==5.5.0
+annotated-types==0.7.0
+antlr4-python3-runtime==4.9.3
+backoff==2.2.1
+blinker==1.9.0
+cachetools==5.5.2
+chardet==5.2.0
+click==8.1.8
+coloredlogs==15.0.1
+contourpy==1.3.1
+cycler==0.12.1
+dataclasses-json==0.6.7
+Deprecated==1.2.18
+distro==1.9.0
+effdet==0.4.1
+emoji==2.14.1
+et_xmlfile==2.0.0
+eval_type_backport==0.2.2
+filetype==1.2.0
+flatbuffers==25.2.10
+fonttools==4.56.0
+frozenlist==1.5.0
+fsspec==2025.3.0
+gitdb==4.0.12
+GitPython==3.1.44
+google-api-core==2.24.2
+google-auth==2.38.0
+google-cloud-vision==3.10.1
+googleapis-common-protos==1.69.2
+greenlet==3.1.1
+grpcio==1.71.0
+grpcio-status==1.71.0
+html5lib==1.1
+httpx-sse==0.4.0
+huggingface-hub==0.29.3
+humanfriendly==10.0
+iopath==0.1.10
+jiter==0.9.0
+joblib==1.4.2
+jsonpatch==1.33
+kiwisolver==1.4.8
+langchain==0.3.20
+langchain-community==0.3.19
+langchain-core==0.3.45
+langchain-openai==0.3.9
+langchain-postgres==0.0.13
+langchain-text-splitters==0.3.6
+langdetect==1.0.9
+langsmith==0.3.15
+layoutparser==0.3.4
+lxml==5.3.1
+Markdown==3.7
+marshmallow==3.26.1
+matplotlib==3.10.1
+mpmath==1.3.0
+multidict==6.2.0
+narwhals==1.31.0
+networkx==3.4.2
+nltk==3.9.1
+numpy==1.26.4
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-cusparselt-cu12==0.6.2
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+olefile==0.47
+omegaconf==2.3.0
+onnx==1.17.0
+onnxruntime==1.21.0
+openai==1.66.3
+opencv-python==4.11.0.86
+openpyxl==3.1.5
+orjson==3.10.15
+pandas==2.2.3
+pdf2image==1.17.0
+pdfminer.six==20231228
+pdfplumber==0.11.5
+pgvector==0.3.6
+pi_heif==0.22.0
+pikepdf==9.5.2
+pillow==11.1.0
+portalocker==3.1.1
+propcache==0.3.0
+proto-plus==1.26.1
+protobuf==5.29.3
+psycopg==3.2.6
+psycopg-pool==3.2.6
+pyarrow==19.0.1
+pyasn1==0.6.1
+pyasn1_modules==0.4.1
+pycocotools==2.0.8
+pydantic==2.10.6
+pydantic-settings==2.8.1
+pydantic_core==2.27.2
+pydeck==0.9.1
+pypandoc==1.15
+pyparsing==3.2.1
+pypdf==5.4.0
+pypdfium2==4.30.1
+python-docx==1.1.2
+python-dotenv==1.0.1
+python-iso639==2025.2.18
+python-magic==0.4.27
+python-multipart==0.0.20
+python-oxmsg==0.0.2
+python-pptx==1.0.2
+pytz==2025.1
+RapidFuzz==3.12.2
+redis==5.2.1
+regex==2024.11.6
+rsa==4.9
+safetensors==0.5.3
+scipy==1.15.2
+smmap==5.0.2
+SQLAlchemy==2.0.39
+streamlit==1.43.2
+sympy==1.13.1
+tenacity==9.0.0
+tiktoken==0.9.0
+timm==1.0.15
+tokenizers==0.21.1
+toml==0.10.2
+torch==2.6.0
+torchvision==0.21.0
+tqdm==4.67.1
+transformers==4.49.0
+triton==3.2.0
+typing-inspect==0.9.0
+typing-inspection==0.4.0
+tzdata==2025.1
+unstructured==0.16.10
+unstructured-client==0.31.1
+unstructured-inference==0.8.1
+unstructured.pytesseract==0.3.15
+watchdog==6.0.0
+wrapt==1.17.2
+xlrd==2.0.1
+XlsxWriter==3.2.2
+yarl==1.18.3
+zstandard==0.23.0