NEXAS committed
Commit 7e75a72 · verified · 1 Parent(s): 247c753

Upload 4 files

Files changed (4)
  1. app.py +53 -0
  2. utils/ingestion.py +119 -0
  3. utils/llm.py +49 -0
  4. utils/qa.py +58 -0
app.py ADDED
@@ -0,0 +1,53 @@
+ import streamlit as st
+ import os
+ from utils.ingestion import DocumentProcessor
+ from utils.llm import LLMProcessor
+ from utils.qa import QAEngine
+
+ # Set up Streamlit page
+ st.set_page_config(page_title="AI-Powered Document QA", layout="wide")
+ st.title("📄 AI-Powered Document QA")
+
+ # Initialize processors; QAEngine reuses the same instances so its
+ # queries hit the collection that process_document() populates
+ document_processor = DocumentProcessor()
+ llm_processor = LLMProcessor()
+ qa_engine = QAEngine(document_processor, llm_processor)
+
+ # File uploader
+ st.sidebar.header("Upload a PDF")
+ uploaded_file = st.sidebar.file_uploader("Choose a PDF file", type=["pdf"])
+
+ if uploaded_file:
+     # Save the upload to a temporary path
+     os.makedirs("temp", exist_ok=True)
+     pdf_path = f"temp/{uploaded_file.name}"
+
+     with open(pdf_path, "wb") as f:
+         f.write(uploaded_file.read())
+
+     st.sidebar.success("✅ File uploaded successfully!")
+
+     # Process the document
+     with st.spinner("🔄 Processing document..."):
+         document_processor.process_document(pdf_path)
+
+     st.sidebar.success("✅ Document processed successfully!")
+
+ # Query input
+ question = st.text_input("Ask a question from the document:", placeholder="What are the key insights?")
+
+ if st.button("🔍 Search & Answer"):
+     if question:
+         with st.spinner("🧠 Searching for relevant context..."):
+             answer = qa_engine.query(question)
+
+         st.subheader("📝 Answer:")
+         st.write(answer)
+     else:
+         st.warning("⚠️ Please enter a question.")
+
+ # Footer
+ st.markdown("---")
+ st.caption("🤖 Powered by ChromaDB + Groq LLM | Built with ❤️ using Streamlit")
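
Note: Streamlit re-runs app.py from the top on every widget interaction, so the processors above are rebuilt (and the uploaded PDF reprocessed) on each rerun. A minimal sketch of one common mitigation, caching the heavy objects with st.cache_resource (the get_engines helper is hypothetical, not part of this commit):

import streamlit as st

from utils.ingestion import DocumentProcessor
from utils.llm import LLMProcessor
from utils.qa import QAEngine

@st.cache_resource  # built once per server process, reused across reruns
def get_engines():
    document_processor = DocumentProcessor()
    llm_processor = LLMProcessor()
    qa_engine = QAEngine(document_processor, llm_processor)
    return document_processor, llm_processor, qa_engine

document_processor, llm_processor, qa_engine = get_engines()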
utils/ingestion.py ADDED
@@ -0,0 +1,119 @@
+ import json
+ import time
+ from typing import Dict, Any
+ from tempfile import mkdtemp
+
+ from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+ from docling.datamodel.base_models import InputFormat
+ from docling.datamodel.pipeline_options import (
+     AcceleratorDevice,
+     AcceleratorOptions,
+     PdfPipelineOptions,
+     TableFormerMode,
+ )
+ from docling.document_converter import DocumentConverter, PdfFormatOption
+ from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
+ from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
+ import chromadb
+
+
+ class DocumentProcessor:
+     def __init__(self):
+         """Initialize document processor with necessary components"""
+         self.setup_document_converter()
+         self.embed_model = FastEmbedEmbeddings()
+         # Persistent client backed by a temp dir: the index survives reruns
+         # within a session but is rebuilt from scratch after a restart
+         self.client = chromadb.PersistentClient(path=mkdtemp())
+         self.index = None  # set by process_document(), queried by QAEngine
+
+     def setup_document_converter(self):
+         """Configure document converter with advanced processing capabilities"""
+         pipeline_options = PdfPipelineOptions()
+         pipeline_options.do_ocr = True
+         pipeline_options.do_table_structure = True
+         pipeline_options.table_structure_options.do_cell_matching = True
+         pipeline_options.ocr_options.lang = ["en"]
+         pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
+         # MPS targets Apple Silicon; use AcceleratorDevice.AUTO on other hardware
+         pipeline_options.accelerator_options = AcceleratorOptions(
+             num_threads=8, device=AcceleratorDevice.MPS
+         )
+
+         self.converter = DocumentConverter(
+             format_options={
+                 InputFormat.PDF: PdfFormatOption(
+                     pipeline_options=pipeline_options,
+                     backend=PyPdfiumDocumentBackend
+                 )
+             }
+         )
+
+     def extract_chunk_metadata(self, chunk) -> Dict[str, Any]:
+         """Extract essential metadata from a chunk"""
+         metadata = {
+             "text": chunk.text,
+             "headings": [],
+             "page_info": None,
+             "content_type": None
+         }
+
+         if hasattr(chunk, 'meta'):
+             if hasattr(chunk.meta, 'headings') and chunk.meta.headings:
+                 metadata["headings"] = chunk.meta.headings
+
+             if hasattr(chunk.meta, 'doc_items'):
+                 for item in chunk.meta.doc_items:
+                     if hasattr(item, 'label'):
+                         metadata["content_type"] = str(item.label)
+
+                     if hasattr(item, 'prov') and item.prov:
+                         for prov in item.prov:
+                             if hasattr(prov, 'page_no'):
+                                 metadata["page_info"] = prov.page_no
+
+         return metadata
+
+     def process_document(self, pdf_path: str) -> Any:
+         """Process document and create searchable index with metadata"""
+         print(f"Processing document: {pdf_path}")
+         start_time = time.time()
+
+         result = self.converter.convert(pdf_path)
+         doc = result.document
+
+         chunker = HybridChunker(tokenizer="jinaai/jina-embeddings-v3")
+         chunks = list(chunker.chunk(doc))
+
+         processed_chunks = [self.extract_chunk_metadata(chunk) for chunk in chunks]
+
+         print("\nCreating vector database...")
+         collection = self.client.get_or_create_collection(name="document_chunks")
+
+         documents = []
+         embeddings = []
+         metadata_list = []
+         ids = []
+
+         for idx, chunk in enumerate(processed_chunks):
+             # FastEmbedEmbeddings exposes embed_documents()/embed_query(), not encode()
+             embedding = self.embed_model.embed_documents([chunk['text']])[0]
+             documents.append(chunk['text'])
+             embeddings.append(embedding)
+             # ChromaDB rejects None metadata values, so fall back to neutral defaults
+             metadata_list.append({
+                 "headings": json.dumps(chunk['headings']),
+                 "page": chunk['page_info'] if chunk['page_info'] is not None else 0,
+                 "content_type": chunk['content_type'] or ""
+             })
+             ids.append(str(idx))
+
+         collection.add(
+             ids=ids,
+             embeddings=embeddings,
+             documents=documents,
+             metadatas=metadata_list
+         )
+
+         self.index = collection  # keep a handle for QAEngine queries
+         processing_time = time.time() - start_time
+         print(f"\nDocument processing completed in {processing_time:.2f} seconds")
+         return collection
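
For reference, the collection returned by process_document can be queried directly with the standard ChromaDB API. A minimal sketch, assuming a placeholder "sample.pdf" (and keeping in mind that the mkdtemp() path above means the index lives in a fresh temp directory each session):

from utils.ingestion import DocumentProcessor

processor = DocumentProcessor()
collection = processor.process_document("sample.pdf")  # placeholder path

# Embed the query with the same FastEmbed model used at ingestion time
query_vec = processor.embed_model.embed_query("What are the key findings?")

results = collection.query(query_embeddings=[query_vec], n_results=3)

# ChromaDB nests results per query; index [0] selects the single query above
for text, meta in zip(results["documents"][0], results["metadatas"][0]):
    print(meta["page"], meta["content_type"], text[:80])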
utils/llm.py ADDED
@@ -0,0 +1,49 @@
+ from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
+ from langchain_groq import ChatGroq
+ import os
+ import json
+ from typing import List, Dict
+
+
+ class LLMProcessor:
+     def __init__(self):
+         """Initialize embedding model and Groq LLM"""
+         self.api_key = os.getenv("GROQ_API_KEY")
+
+         # Use FastEmbed instead of SentenceTransformer
+         self.embed_model = FastEmbedEmbeddings()
+
+         self.llm = ChatGroq(
+             model_name="mixtral-8x7b-32768",
+             api_key=self.api_key
+         )
+
+     def format_context(self, chunks: List[Dict]) -> str:
+         """Format retrieved chunks into a structured context for the LLM"""
+         context_parts = []
+         for chunk in chunks:
+             try:
+                 headings = json.loads(chunk['headings'])
+                 if headings:
+                     context_parts.append(f"Section: {' > '.join(headings)}")
+             except (json.JSONDecodeError, TypeError):
+                 pass  # malformed or missing headings; skip the section line
+
+             if chunk['page']:
+                 context_parts.append(f"Page {chunk['page']}:")
+
+             context_parts.append(chunk['text'])
+             context_parts.append("-" * 40)
+
+         return "\n".join(context_parts)
+
+     def generate_answer(self, context: str, question: str) -> str:
+         """Generate answer using structured context"""
+         prompt = f"""Based on the following excerpts from a document:
+
+ {context}
+
+ Please answer this question: {question}
+
+ Make use of the section information and page numbers in your answer when relevant.
+ """
+         # invoke() returns an AIMessage; return its text so callers get a str
+         return self.llm.invoke(prompt).content
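
The chunk dicts that format_context expects use the same keys QAEngine.query produces: "text", "headings" (a JSON-encoded list), "page", and "content_type". A minimal sketch with made-up sample values (constructing LLMProcessor requires GROQ_API_KEY to be set):

import json

from utils.llm import LLMProcessor

llm_processor = LLMProcessor()

chunks = [{
    "text": "Revenue grew 12% year over year.",  # sample value, not from a real document
    "headings": json.dumps(["Results", "Financials"]),
    "page": 4,
    "content_type": "text",
}]

print(llm_processor.format_context(chunks))
# Section: Results > Financials
# Page 4:
# Revenue grew 12% year over year.
# ----------------------------------------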
utils/qa.py ADDED
@@ -0,0 +1,58 @@
+ import logging
+
+ from utils.ingestion import DocumentProcessor
+ from utils.llm import LLMProcessor
+
+
+ class QAEngine:
+     def __init__(self, processor=None, llm_processor=None):
+         # Accept shared instances so queries run against the same ChromaDB
+         # collection that process_document() populated
+         self.processor = processor or DocumentProcessor()
+         self.llm_processor = llm_processor or LLMProcessor()
+
+     def query(self, question: str, k: int = 5) -> str:
+         """Query the document using semantic search and generate an answer"""
+         query_embedding = self.llm_processor.embed_model.embed_query(question)
+
+         # Query ChromaDB for the k most similar chunks
+         results = self.processor.index.query(
+             query_embeddings=[query_embedding],
+             n_results=k
+         )
+
+         # ChromaDB nests results per query; [0] selects the single query above
+         chunks = []
+         for i in range(len(results["documents"][0])):
+             chunks.append({
+                 "text": results["documents"][0][i],
+                 "headings": results["metadatas"][0][i].get("headings", "[]"),
+                 "page": results["metadatas"][0][i].get("page"),
+                 "content_type": results["metadatas"][0][i].get("content_type")
+             })
+
+         print(f"\nRelevant chunks for query: '{question}'")
+         print("=" * 80)
+
+         context = self.llm_processor.format_context(chunks)
+         print(context)
+
+         return self.llm_processor.generate_answer(context, question)
+
+
+ # def main():
+ #     logging.basicConfig(level=logging.INFO)
+ #
+ #     processor = DocumentProcessor()
+ #     pdf_path = "sample/InternLM.pdf"
+ #     processor.process_document(pdf_path)
+ #
+ #     # Pass the processor that indexed the PDF so query() can find it
+ #     qa_engine = QAEngine(processor)
+ #     question = "What are the main features of InternLM-XComposer-2.5?"
+ #     answer = qa_engine.query(question)
+ #
+ #     print("\nAnswer:")
+ #     print("=" * 80)
+ #     print(answer)
+
+
+ # if __name__ == "__main__":
+ #     main()