NEXAS committed
Commit 7e75a72 · verified · 1 Parent(s): 247c753

Upload 4 files

Files changed (4)
  1. app.py +53 -0
  2. utils/ingestion.py +119 -0
  3. utils/llm.py +49 -0
  4. utils/qa.py +58 -0
app.py ADDED
@@ -0,0 +1,53 @@
+ import streamlit as st
+ import os
+ from utils.ingestion import DocumentProcessor
+ from utils.llm import LLMProcessor
+ from utils.qa import QAEngine
+
+ # Set up Streamlit page
+ st.set_page_config(page_title="AI-Powered Document QA", layout="wide")
+ st.title("📄 AI-Powered Document QA")
+
+ # Initialize processors; QAEngine reuses the same instances so its
+ # queries hit the collection that process_document() populates
+ document_processor = DocumentProcessor()
+ llm_processor = LLMProcessor()
+ qa_engine = QAEngine(document_processor, llm_processor)
+
+ # File uploader
+ st.sidebar.header("Upload a PDF")
+ uploaded_file = st.sidebar.file_uploader("Choose a PDF file", type=["pdf"])
+
+ if uploaded_file:
+     # Save the upload to a temporary path
+     os.makedirs("temp", exist_ok=True)
+     pdf_path = f"temp/{uploaded_file.name}"
+
+     with open(pdf_path, "wb") as f:
+         f.write(uploaded_file.read())
+
+     st.sidebar.success("✅ File uploaded successfully!")
+
+     # Process the document
+     with st.spinner("🔄 Processing document..."):
+         document_processor.process_document(pdf_path)
+
+     st.sidebar.success("✅ Document processed successfully!")
+
+ # Query input
+ question = st.text_input("Ask a question from the document:", placeholder="What are the key insights?")
+
+ if st.button("🔍 Search & Answer"):
+     if question:
+         with st.spinner("🧠 Searching for relevant context..."):
+             answer = qa_engine.query(question)
+
+         st.subheader("📝 Answer:")
+         st.write(answer)
+     else:
+         st.warning("⚠️ Please enter a question.")
+
+ # Footer
+ st.markdown("---")
+ st.caption("🤖 Powered by ChromaDB + Groq LLM | Built with ❤️ using Streamlit")
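
Note: Streamlit re-runs app.py from the top on every widget interaction, so the processors above are rebuilt (and the uploaded PDF reprocessed) on each rerun. A minimal sketch of one common mitigation, caching the heavy objects with st.cache_resource (the get_engines helper is hypothetical, not part of this commit):

import streamlit as st

from utils.ingestion import DocumentProcessor
from utils.llm import LLMProcessor
from utils.qa import QAEngine

@st.cache_resource  # built once per server process, reused across reruns
def get_engines():
    document_processor = DocumentProcessor()
    llm_processor = LLMProcessor()
    qa_engine = QAEngine(document_processor, llm_processor)
    return document_processor, llm_processor, qa_engine

document_processor, llm_processor, qa_engine = get_engines()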
utils/ingestion.py ADDED
@@ -0,0 +1,119 @@
+ import json
+ import time
+ from typing import Dict, Any
+ from tempfile import mkdtemp
+
+ from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+ from docling.datamodel.base_models import InputFormat
+ from docling.datamodel.pipeline_options import (
+     AcceleratorDevice,
+     AcceleratorOptions,
+     PdfPipelineOptions,
+     TableFormerMode,
+ )
+ from docling.document_converter import DocumentConverter, PdfFormatOption
+ from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
+ from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
+ import chromadb
+
+
+ class DocumentProcessor:
+     def __init__(self):
+         """Initialize document processor with necessary components"""
+         self.setup_document_converter()
+         self.embed_model = FastEmbedEmbeddings()
+         # Persistent client backed by a temp dir: the index survives reruns
+         # within a session but is rebuilt from scratch after a restart
+         self.client = chromadb.PersistentClient(path=mkdtemp())
+         self.index = None  # set by process_document(), queried by QAEngine
+
+     def setup_document_converter(self):
+         """Configure document converter with advanced processing capabilities"""
+         pipeline_options = PdfPipelineOptions()
+         pipeline_options.do_ocr = True
+         pipeline_options.do_table_structure = True
+         pipeline_options.table_structure_options.do_cell_matching = True
+         pipeline_options.ocr_options.lang = ["en"]
+         pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
+         # MPS targets Apple Silicon; use AcceleratorDevice.AUTO on other hardware
+         pipeline_options.accelerator_options = AcceleratorOptions(
+             num_threads=8, device=AcceleratorDevice.MPS
+         )
+
+         self.converter = DocumentConverter(
+             format_options={
+                 InputFormat.PDF: PdfFormatOption(
+                     pipeline_options=pipeline_options,
+                     backend=PyPdfiumDocumentBackend
+                 )
+             }
+         )
+
+     def extract_chunk_metadata(self, chunk) -> Dict[str, Any]:
+         """Extract essential metadata from a chunk"""
+         metadata = {
+             "text": chunk.text,
+             "headings": [],
+             "page_info": None,
+             "content_type": None
+         }
+
+         if hasattr(chunk, 'meta'):
+             if hasattr(chunk.meta, 'headings') and chunk.meta.headings:
+                 metadata["headings"] = chunk.meta.headings
+
+             if hasattr(chunk.meta, 'doc_items'):
+                 for item in chunk.meta.doc_items:
+                     if hasattr(item, 'label'):
+                         metadata["content_type"] = str(item.label)
+
+                     if hasattr(item, 'prov') and item.prov:
+                         for prov in item.prov:
+                             if hasattr(prov, 'page_no'):
+                                 metadata["page_info"] = prov.page_no
+
+         return metadata
+
+     def process_document(self, pdf_path: str) -> Any:
+         """Process document and create searchable index with metadata"""
+         print(f"Processing document: {pdf_path}")
+         start_time = time.time()
+
+         result = self.converter.convert(pdf_path)
+         doc = result.document
+
+         chunker = HybridChunker(tokenizer="jinaai/jina-embeddings-v3")
+         chunks = list(chunker.chunk(doc))
+
+         processed_chunks = [self.extract_chunk_metadata(chunk) for chunk in chunks]
+
+         print("\nCreating vector database...")
+         collection = self.client.get_or_create_collection(name="document_chunks")
+
+         documents = []
+         embeddings = []
+         metadata_list = []
+         ids = []
+
+         for idx, chunk in enumerate(processed_chunks):
+             # FastEmbedEmbeddings exposes embed_documents()/embed_query(), not encode()
+             embedding = self.embed_model.embed_documents([chunk['text']])[0]
+             documents.append(chunk['text'])
+             embeddings.append(embedding)
+             # ChromaDB rejects None metadata values, so fall back to neutral defaults
+             metadata_list.append({
+                 "headings": json.dumps(chunk['headings']),
+                 "page": chunk['page_info'] if chunk['page_info'] is not None else 0,
+                 "content_type": chunk['content_type'] or ""
+             })
+             ids.append(str(idx))
+
+         collection.add(
+             ids=ids,
+             embeddings=embeddings,
+             documents=documents,
+             metadatas=metadata_list
+         )
+
+         self.index = collection  # keep a handle for QAEngine queries
+         processing_time = time.time() - start_time
+         print(f"\nDocument processing completed in {processing_time:.2f} seconds")
+         return collection
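
For reference, the collection returned by process_document can be queried directly with the standard ChromaDB API. A minimal sketch, assuming a placeholder "sample.pdf" (and keeping in mind that the mkdtemp() path above means the index lives in a fresh temp directory each session):

from utils.ingestion import DocumentProcessor

processor = DocumentProcessor()
collection = processor.process_document("sample.pdf")  # placeholder path

# Embed the query with the same FastEmbed model used at ingestion time
query_vec = processor.embed_model.embed_query("What are the key findings?")

results = collection.query(query_embeddings=[query_vec], n_results=3)

# ChromaDB nests results per query; index [0] selects the single query above
for text, meta in zip(results["documents"][0], results["metadatas"][0]):
    print(meta["page"], meta["content_type"], text[:80])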
utils/llm.py ADDED
@@ -0,0 +1,49 @@
+ from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
+ from langchain_groq import ChatGroq
+ import os
+ import json
+ from typing import List, Dict
+
+
+ class LLMProcessor:
+     def __init__(self):
+         """Initialize embedding model and Groq LLM"""
+         self.api_key = os.getenv("GROQ_API_KEY")
+
+         # Use FastEmbed instead of SentenceTransformer
+         self.embed_model = FastEmbedEmbeddings()
+
+         self.llm = ChatGroq(
+             model_name="mixtral-8x7b-32768",
+             api_key=self.api_key
+         )
+
+     def format_context(self, chunks: List[Dict]) -> str:
+         """Format retrieved chunks into a structured context for the LLM"""
+         context_parts = []
+         for chunk in chunks:
+             try:
+                 headings = json.loads(chunk['headings'])
+                 if headings:
+                     context_parts.append(f"Section: {' > '.join(headings)}")
+             except (json.JSONDecodeError, TypeError):
+                 pass  # malformed or missing headings; skip the section line
+
+             if chunk['page']:
+                 context_parts.append(f"Page {chunk['page']}:")
+
+             context_parts.append(chunk['text'])
+             context_parts.append("-" * 40)
+
+         return "\n".join(context_parts)
+
+     def generate_answer(self, context: str, question: str) -> str:
+         """Generate answer using structured context"""
+         prompt = f"""Based on the following excerpts from a document:
+
+ {context}
+
+ Please answer this question: {question}
+
+ Make use of the section information and page numbers in your answer when relevant.
+ """
+         # invoke() returns an AIMessage; return its text so callers get a str
+         return self.llm.invoke(prompt).content
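
The chunk dicts that format_context expects use the same keys QAEngine.query produces: "text", "headings" (a JSON-encoded list), "page", and "content_type". A minimal sketch with made-up sample values (constructing LLMProcessor requires GROQ_API_KEY to be set):

import json

from utils.llm import LLMProcessor

llm_processor = LLMProcessor()

chunks = [{
    "text": "Revenue grew 12% year over year.",  # sample value, not from a real document
    "headings": json.dumps(["Results", "Financials"]),
    "page": 4,
    "content_type": "text",
}]

print(llm_processor.format_context(chunks))
# Section: Results > Financials
# Page 4:
# Revenue grew 12% year over year.
# ----------------------------------------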
utils/qa.py ADDED
@@ -0,0 +1,58 @@
+ import logging
+
+ from utils.ingestion import DocumentProcessor
+ from utils.llm import LLMProcessor
+
+
+ class QAEngine:
+     def __init__(self, processor=None, llm_processor=None):
+         # Accept shared instances so queries run against the same ChromaDB
+         # collection that process_document() populated
+         self.processor = processor or DocumentProcessor()
+         self.llm_processor = llm_processor or LLMProcessor()
+
+     def query(self, question: str, k: int = 5) -> str:
+         """Query the document using semantic search and generate an answer"""
+         query_embedding = self.llm_processor.embed_model.embed_query(question)
+
+         # Query ChromaDB for the k most similar chunks
+         results = self.processor.index.query(
+             query_embeddings=[query_embedding],
+             n_results=k
+         )
+
+         # ChromaDB nests results per query; [0] selects the single query above
+         chunks = []
+         for i in range(len(results["documents"][0])):
+             chunks.append({
+                 "text": results["documents"][0][i],
+                 "headings": results["metadatas"][0][i].get("headings", "[]"),
+                 "page": results["metadatas"][0][i].get("page"),
+                 "content_type": results["metadatas"][0][i].get("content_type")
+             })
+
+         print(f"\nRelevant chunks for query: '{question}'")
+         print("=" * 80)
+
+         context = self.llm_processor.format_context(chunks)
+         print(context)
+
+         return self.llm_processor.generate_answer(context, question)
+
+
+ # def main():
+ #     logging.basicConfig(level=logging.INFO)
+ #
+ #     processor = DocumentProcessor()
+ #     pdf_path = "sample/InternLM.pdf"
+ #     processor.process_document(pdf_path)
+ #
+ #     # Pass the processor that indexed the PDF so query() can find it
+ #     qa_engine = QAEngine(processor)
+ #     question = "What are the main features of InternLM-XComposer-2.5?"
+ #     answer = qa_engine.query(question)
+ #
+ #     print("\nAnswer:")
+ #     print("=" * 80)
+ #     print(answer)
+
+
+ # if __name__ == "__main__":
+ #     main()