umarbalak committed on
Commit 4ff3463 · 1 Parent(s): e226092
Files changed (3)
  1. .gitignore +183 -0
  2. app.py +235 -60
  3. data_extraction.py +264 -0
.gitignore ADDED
@@ -0,0 +1,183 @@
+
+ QueryMind
+
+ *.log
+ *.json
+ *.docx
+ *.pdf
+ *.md
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ #uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # Ruff stuff:
+ .ruff_cache/
+
+ # PyPI configuration file
+ .pypirc
app.py CHANGED
@@ -1,64 +1,239 @@
  import gradio as gr
- from huggingface_hub import InferenceClient
-
- """
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
- """
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-
-
- def respond(
-     message,
-     history: list[tuple[str, str]],
-     system_message,
-     max_tokens,
-     temperature,
-     top_p,
- ):
-     messages = [{"role": "system", "content": system_message}]
-
-     for val in history:
-         if val[0]:
-             messages.append({"role": "user", "content": val[0]})
-         if val[1]:
-             messages.append({"role": "assistant", "content": val[1]})
-
-     messages.append({"role": "user", "content": message})
-
-     response = ""
-
-     for message in client.chat_completion(
-         messages,
-         max_tokens=max_tokens,
-         stream=True,
-         temperature=temperature,
-         top_p=top_p,
-     ):
-         token = message.choices[0].delta.content
-
-         response += token
-         yield response
-
-
- """
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
- """
- demo = gr.ChatInterface(
-     respond,
-     additional_inputs=[
-         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p (nucleus sampling)",
-         ),
-     ],
- )


  if __name__ == "__main__":
-     demo.launch()

  import gradio as gr
+ from pathlib import Path
+ from tempfile import mkdtemp

+ # LangChain & Embedding/LLM
+ from langchain_community.vectorstores import FAISS
+ from langchain_huggingface.embeddings import HuggingFaceEmbeddings
+ from langchain_huggingface import HuggingFaceEndpoint
+ from langchain.chains import create_retrieval_chain
+ from langchain.chains.combine_documents import create_stuff_documents_chain
+ from langchain_core.prompts import PromptTemplate
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+ # Custom Data Extractor
+ from data_extraction import DataExtractor
+
+ # Constants
+ EMBED_MODEL_ID = "BAAI/bge-large-en-v1.5"
+ GEN_MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1"
+ TOP_K = 5
+ TEMP_DIR = Path(mkdtemp())
+ milvus_uri = str(TEMP_DIR / "docling.db")
+
+ # Initialize
+ extractor = DataExtractor()
+ embedding = HuggingFaceEmbeddings(model_name=EMBED_MODEL_ID)
+ llm = HuggingFaceEndpoint(repo_id=GEN_MODEL_ID, task="text-generation")
+
+ template = """You are a helpful assistant. Based on the following context, answer the user's query.
+ If you don't know the answer, just say that you don't know. Don't try to make up an answer.
+ Use the given context strictly to answer the question.
+
+ <context>
+ {context}
+ </context>
+
+ Question: {input}
+ Helpful Answer:"""
+
+ rag_prompt = PromptTemplate.from_template(template)
+
+ # App State
+ vectorstore = None
+ retriever = None  # Store the retriever separately
+ rag_chain = None
+
+ # Function to process using direct extraction methods
+ def process_direct(file_path):
+     text = ""
+     if file_path.suffix.lower() == ".pdf":
+         text = extractor.extract_text_from_pdf(file_path)
+     elif file_path.suffix.lower() == ".docx":
+         text = extractor.extract_text_from_doc(file_path)
+     elif file_path.suffix.lower() == ".txt":
+         text = extractor.extract_text_from_txt(file_path)
+     else:
+         return None, "Unsupported file format. Please use PDF, DOCX, or TXT."
+
+     # Clean text
+     text = extractor.clean_text(text)
+
+     # Split text using RecursiveCharacterTextSplitter
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=1000,
+         chunk_overlap=200,
+         length_function=len
+     )
+
+     from langchain_core.documents import Document
+     doc = Document(
+         page_content=text,
+         metadata={
+             "source": str(file_path),
+             "filename": file_path.name
+         }
+     )
+     return text_splitter.split_documents([doc]), "Direct extraction completed successfully."
+
+ # Function to process using docling
+ def process_docling(file_path):
+     try:
+         if file_path.suffix.lower() == ".pdf":
+             # Convert PDF to markdown
+             markdown_path = TEMP_DIR / "temp.md"
+             extractor.pdf_to_markdown(file_path, markdown_path)
+             # Load markdown
+             markdown_raw = extractor.load_markdown_file(markdown_path)
+         elif file_path.suffix.lower() == ".md":
+             markdown_raw = extractor.load_markdown_file(file_path)
+         else:
+             return None, "Only PDF or Markdown files are supported for docling extraction."
+
+         # Update metadata with filename
+         for doc in markdown_raw:
+             doc.metadata.update({
+                 "filename": file_path.name
+             })
+
+         # Split markdown
+         splits = extractor.spit_markdown(markdown_raw)
+
+         # Update metadata for all splits
+         for split in splits:
+             split.metadata.update({
+                 "filename": file_path.name
+             })
+
+         return splits, "Docling extraction completed successfully."
+     except Exception as e:
+         return None, f"Error during docling extraction: {str(e)}"
+
+ # Step 1: File Upload & Index
+ def process_file(file, extraction_method):
+     global vectorstore, retriever, rag_chain
+
+     if file is None:
+         return "Please upload a file first."
+
+     file_path = Path(file.name)
+
+     # Process based on selected extraction method
+     if extraction_method == "Direct Extraction":
+         data_splits, message = process_direct(file_path)
+     else:  # Docling Extraction
+         data_splits, message = process_docling(file_path)
+
+     if data_splits is None:
+         return message
+
+     # Create vector store
+     vectorstore = FAISS.from_documents(documents=data_splits, embedding=embedding)
+
+     # Create retriever and chain
+     retriever = vectorstore.as_retriever(search_kwargs={"k": TOP_K})
+     question_answer_chain = create_stuff_documents_chain(llm, prompt=rag_prompt)
+     rag_chain = create_retrieval_chain(retriever, question_answer_chain)
+
+     # Count chunks created
+     chunk_count = len(data_splits)
+
+     return f"File processed using {extraction_method}. {message} Created {chunk_count} document chunks. You can now ask questions."
+
+ # Format source information
+ def format_sources(docs):
+     source_info = []
+     seen_sources = set()
+
+     for i, doc in enumerate(docs, 1):
+         # Extract source information
+         source = doc.metadata.get("source", "Unknown")
+         filename = doc.metadata.get("filename", Path(source).name if source != "Unknown" else "Unknown")
+
+         # Get header information if available (from docling extraction)
+         header_info = []
+         for level in range(1, 4):
+             header = doc.metadata.get(f"Header_{level}", "")
+             if header:
+                 header_info.append(header)
+
+         # Create a unique identifier for this source
+         source_id = f"{filename}{'_' + '_'.join(header_info) if header_info else ''}"
+
+         if source_id not in seen_sources:
+             seen_sources.add(source_id)
+
+             # Format the source information
+             source_text = f"**Source {i}: {filename}**"
+             if header_info:
+                 source_text += f"\n - Section: {' > '.join(header_info)}"
+
+             # Add the full text content without truncation
+             source_text += f"\n - Content: \"{doc.page_content}\""
+
+             source_info.append(source_text)
+
+     return "\n\n".join(source_info)
+
+ # Step 2: Chat with RAG
+ def chat_with_doc(query):
+     if not rag_chain:
+         return "Upload and process a document first.", ""
+
+     # Use the stored retriever directly to get documents
+     if not retriever:
+         return "Retriever not initialized. Please process a document first.", ""
+
+     # Get documents directly from retriever
+     retrieved_docs = retriever.get_relevant_documents(query)
+
+     # Then invoke the chain with the query
+     result = rag_chain.invoke({"input": query})
+     answer = result.get("answer", "No answer found.")
+
+     # Format source information
+     source_text = format_sources(retrieved_docs)
+
+     return answer, source_text
+
+ # Gradio Interface
+ with gr.Blocks() as app:
+     gr.Markdown("# Document Q&A System")
+
+     with gr.Tab("Upload Document"):
+         file_input = gr.File(label="Upload File", file_types=[".pdf", ".md", ".docx", ".txt"])
+         extraction_method = gr.Radio(
+             ["Direct Extraction", "Docling Extraction"],
+             label="Extraction Method",
+             value="Direct Extraction",
+             info="Direct extraction is faster but simpler. Docling extraction preserves document structure better."
+         )
+         process_btn = gr.Button("Process Document")
+         file_output = gr.Textbox(label="Processing Status", interactive=False)
+
+         process_btn.click(
+             fn=process_file,
+             inputs=[file_input, extraction_method],
+             outputs=file_output
+         )
+
+     with gr.Tab("Chat with Document"):
+         chat_input = gr.Textbox(label="Ask a Question", placeholder="Type your question...")
+         ask_btn = gr.Button("Ask")
+         chat_output = gr.Textbox(label="Answer", interactive=False)
+         sources_output = gr.Textbox(label="Sources", interactive=False)
+
+         ask_btn.click(
+             fn=chat_with_doc,
+             inputs=chat_input,
+             outputs=[chat_output, sources_output]
+         )
+
+         chat_input.submit(
+             fn=chat_with_doc,
+             inputs=chat_input,
+             outputs=[chat_output, sources_output]
+         )

  if __name__ == "__main__":
+     app.launch()
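
Note: the following is a minimal sketch, not part of the commit, showing how the new RAG flow in app.py could be exercised without the Gradio UI. It assumes the same dependencies as app.py and a valid Hugging Face token for HuggingFaceEndpoint; importing app reuses the embedding model, LLM endpoint and prompt it builds at import time, and "sample.pdf" plus the question string are placeholders.

    from pathlib import Path

    from langchain_community.vectorstores import FAISS
    from langchain.chains import create_retrieval_chain
    from langchain.chains.combine_documents import create_stuff_documents_chain

    # Reuse the objects app.py creates at import time (extractor, embedding, llm, prompt).
    from app import process_direct, embedding, llm, rag_prompt, TOP_K

    # 1. Extract and chunk a document with the same helper the UI calls.
    splits, status = process_direct(Path("sample.pdf"))  # placeholder path
    print(status)

    # 2. Index the chunks and assemble the retrieval chain, mirroring process_file().
    vectorstore = FAISS.from_documents(documents=splits, embedding=embedding)
    retriever = vectorstore.as_retriever(search_kwargs={"k": TOP_K})
    rag_chain = create_retrieval_chain(
        retriever, create_stuff_documents_chain(llm, prompt=rag_prompt)
    )

    # 3. Ask a question, mirroring chat_with_doc().
    result = rag_chain.invoke({"input": "What is this document about?"})
    print(result.get("answer", "No answer found."))
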
data_extraction.py ADDED
@@ -0,0 +1,264 @@
+ import json
+ import re
+ from pathlib import Path
+
+ class DataExtractor:
+     def __init__(self):
+         """
+         Initialize DataExtractor with robust logging and error handling
+         """
+         pass
+
+     def extract_text_from_pdf(self, pdf_path):
+         """
+         Robust PDF text extraction with enhanced error handling
+
+         Args:
+             pdf_path (str): Path to PDF file
+
+         Returns:
+             str: Extracted text or empty string
+         """
+         try:
+             import pymupdf
+             doc = pymupdf.open(pdf_path)
+
+             # Extract text with page-level error handling
+             full_text = []
+             for page_num, page in enumerate(doc, 1):
+                 try:
+                     page_text = page.get_text()
+                     full_text.append(page_text)
+                 except Exception as page_error:
+                     print(f"Text extraction failed for page {page_num} in {pdf_path}: {page_error}")
+
+             return ''.join(full_text)
+
+         except Exception as e:
+             print(f"PDF Extraction failed for {pdf_path}: {e}")
+             return ""
+
+     def extract_text_from_doc(self, doc_path):
+         """
+         Robust DOCX text extraction with comprehensive error handling
+
+         Args:
+             doc_path (str): Path to DOCX file
+
+         Returns:
+             str: Extracted text or empty string
+         """
+         try:
+             import docx2txt
+             return docx2txt.process(doc_path)
+         except Exception as e:
+             print(f"DOCX Extraction failed for {doc_path}: {e}")
+             return ""
+
+     def extract_text_from_txt(self, txt_path):
+         """
+         Robust TXT file text extraction
+
+         Args:
+             txt_path (str): Path to TXT file
+
+         Returns:
+             str: Extracted text or empty string
+         """
+         try:
+             with open(txt_path, "r", encoding="utf-8") as f:
+                 return f.read()
+         except UnicodeDecodeError:
+             # Try alternative encodings
+             for encoding in ['latin-1', 'iso-8859-1', 'cp1252']:
+                 try:
+                     with open(txt_path, "r", encoding=encoding) as f:
+                         return f.read()
+                 except:
+                     continue
+         except Exception as e:
+             print(f"TXT Extraction failed for {txt_path}: {e}")
+             return ""
+
+     def clean_text(self, text):
+         """
+         Robust cleaner for government FAQ chatbot.
+         - Removes boilerplate (page numbers, headers/footers)
+         - Preserves layout for LLM tokenization
+         - Normalizes structure and character encoding
+         """
+
+         # Normalize line endings
+         text = text.replace('\r\n', '\n').replace('\r', '\n')
+
+         # Remove page headers/footers (common boilerplate)
+         text = re.sub(r'\n?Page\s*\d+(\s*of\s*\d+)?\n?', '\n', text, flags=re.IGNORECASE)
+         text = re.sub(r'\n?Copyright.*?\d{4}.*?\n?', '\n', text, flags=re.IGNORECASE)
+         text = re.sub(r'\n?All rights reserved.*?\n?', '\n', text, flags=re.IGNORECASE)
+
+         # Remove decorative dividers
+         text = re.sub(r'^\s*[-=_*]{3,}\s*$', '', text, flags=re.MULTILINE)
+
+         # Strip non-printable characters but keep structure
+         text = re.sub(r'[^\x09\x0A\x0D\x20-\x7E]', '', text)
+
+         # Normalize whitespace
+         text = re.sub(r'[ \t]+', ' ', text)  # inline
+         text = re.sub(r'[ \t]+\n', '\n', text)  # line ends
+         text = re.sub(r'\n{3,}', '\n\n', text)  # excessive newlines
+
+         return text.strip()
+
+
+     def extract_text_from_files(self, doc_paths):
+         """
+         Extract and process text from multiple document types
+
+         Returns:
+             list: Extracted document data with metadata
+         """
+         doc_paths = [
+             str(Path(path).resolve())
+             for path in doc_paths
+             if Path(path).exists()
+         ]
+         data = []
+
+         for path in doc_paths:
+             try:
+                 ext = Path(path).suffix.lower()
+
+                 # Text extraction based on file type
+                 if ext == '.pdf':
+                     text = self.extract_text_from_pdf(path)
+                 elif ext == '.docx':
+                     text = self.extract_text_from_doc(path)
+                 elif ext == '.txt':
+                     text = self.extract_text_from_txt(path)
+                 else:
+                     print(f"Unsupported format: {path}")
+                     continue
+
+                 # Skip empty documents
+                 if not text.strip():
+                     print(f"No text extracted from {path}")
+                     continue
+
+                 # Clean and structure extracted text
+                 cleaned_text = self.clean_text(text)
+
+                 # Add document metadata
+                 doc_data = {
+                     "title": Path(path).name,
+                     "path": path,
+                     "text": cleaned_text,
+                     "text_length": len(cleaned_text)
+                 }
+
+                 data.append(doc_data)
+
+             except Exception as e:
+                 print(f"Unexpected error processing {path}: {e}")
+
+         return data
+
+     def pdf_to_markdown(self, pdf_path, markdown_path):
+         """
+         Convert PDF to Markdown using docling
+
+         Args:
+             pdf_path (Path): Path to PDF file
+             markdown_path (Path): Output path for markdown file
+
+         Returns:
+             str: Markdown content
+         """
+         try:
+             from docling.document_converter import DocumentConverter
+
+             # Define the source PDF file
+             source = pdf_path
+             converter = DocumentConverter()
+
+             # Convert the PDF to Markdown
+             result = converter.convert(source)
+             markdown = result.document.export_to_markdown()
+
+             # Write the Markdown output to a file
+             with open(markdown_path, "w", encoding="utf-8") as file:
+                 file.write(markdown)
+
+             return markdown
+         except ImportError:
+             print("Docling is not installed. Please install it with: pip install docling")
+             return ""
+         except Exception as e:
+             print(f"Error converting PDF to Markdown: {e}")
+             return ""
+
+     def load_markdown_file(self, markdown_path):
+         """
+         Load markdown file using langchain_docling
+
+         Args:
+             markdown_path (Path): Path to markdown file
+
+         Returns:
+             list: List of documents
+         """
+         try:
+             from langchain_docling import DoclingLoader
+             from langchain_docling.loader import ExportType
+
+             loader = DoclingLoader(
+                 file_path=markdown_path,
+                 export_type=ExportType.MARKDOWN
+             )
+             data = loader.load()
+             return data
+         except ImportError:
+             print("langchain_docling is not installed. Please install it with: pip install langchain_docling")
+             return []
+         except Exception as e:
+             print(f"Error loading markdown file: {e}")
+             return []
+
+     def spit_markdown(self, markdown):
+         """
+         Split markdown using MarkdownHeaderTextSplitter
+
+         Args:
+             markdown (list): List of markdown documents
+
+         Returns:
+             list: List of split documents
+         """
+         try:
+             from langchain_text_splitters import MarkdownHeaderTextSplitter
+
+             splitter = MarkdownHeaderTextSplitter(
+                 headers_to_split_on=[
+                     ("#", "Header_1"),
+                     ("##", "Header_2"),
+                     ("###", "Header_3"),
+                 ],
+             )
+
+             splits = []
+             for doc in markdown:
+                 for split in splitter.split_text(doc.page_content):
+                     # Include metadata and page_content from the original document
+                     split.metadata.update({
+                         "Header_1": split.metadata.get("Header_1", ""),
+                         "Header_2": split.metadata.get("Header_2", ""),
+                         "Header_3": split.metadata.get("Header_3", "")
+                     })
+                     splits.append(split)
+
+             return splits
+         except ImportError:
+             print("langchain_text_splitters is not installed or has compatibility issues.")
+             return []
+         except Exception as e:
+             print(f"Error splitting markdown: {e}")
+             return []
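
Note: a short usage sketch, not part of the commit, for the DataExtractor API added above. It assumes pymupdf, docx2txt, docling and langchain-docling are installed; the file names are placeholders.

    from pathlib import Path
    from data_extraction import DataExtractor

    extractor = DataExtractor()

    # Plain-text route: extract a single PDF, then clean the text.
    raw = extractor.extract_text_from_pdf("report.pdf")
    clean = extractor.clean_text(raw)
    print(len(clean), "characters after cleaning")

    # Batch route: returns a list of dicts with title, path, text, text_length.
    docs = extractor.extract_text_from_files(["report.pdf", "notes.docx", "faq.txt"])
    print([d["title"] for d in docs])

    # Structure-preserving route: PDF -> Markdown -> header-aware splits.
    extractor.pdf_to_markdown(Path("report.pdf"), Path("report.md"))
    loaded = extractor.load_markdown_file(Path("report.md"))
    splits = extractor.spit_markdown(loaded)
    print(f"{len(splits)} header-delimited chunks")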