Praneetha N committed
Commit e05fa86 · 0 Parent(s)

Initial clean commit

Files changed (6):
  1. .gitattributes +3 -0
  2. .gitignore +16 -0
  3. README.md +0 -0
  4. app.py +151 -0
  5. rag_pipeline.py +272 -0
  6. requirements.txt +17 -0
.gitattributes ADDED
@@ -0,0 +1,3 @@
+ *.png filter=lfs diff=lfs merge=lfs -text
+ *.jpg filter=lfs diff=lfs merge=lfs -text
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,16 @@
+ .DS_Store
+ __pycache__/
+ *.pyc
+ .venv/
+ venv/
+ models/
+ screenshots/
+ data/*.pdf
+ *.pdf
+ *.png
+ *.jpg
+ *.jpeg
+ *.bin
+ *.safetensors
+ *.onnx
+ *.pt
README.md ADDED
File without changes
app.py ADDED
@@ -0,0 +1,151 @@
+ # app.py
+ import os
+ import json
+ import time
+ import tempfile
+ import streamlit as st
+
+ from doc_loader import load_document
+ from figure_extractor import extract_figures
+ from rag_pipeline import (
+     build_rag_pipeline,
+     query_rag_full,
+     evaluate_rag,
+ )
+
+ st.set_page_config(page_title="📑 Visual Document RAG", layout="wide")
+ st.title("📑 Visual Document RAG")
+
+ domain = st.selectbox(
+     "Domain focus",
+     ["Finance", "Healthcare", "Law", "Education", "Multimodal"],
+     index=0,
+     help="Used to lightly steer retrieval/answers",
+ )
+
+ uploaded_file = st.file_uploader("📂 Upload a PDF or Image", type=["pdf", "png", "jpg"])
+ query = st.text_input("🔎 Ask a question about the document:")
+ summarize = st.checkbox("📝 Summarize the whole document")
+
+ if uploaded_file:
+     # persist to temp path for libs
+     with tempfile.NamedTemporaryFile(
+         delete=False, suffix=f".{uploaded_file.name.split('.')[-1]}"
+     ) as tmp:
+         tmp.write(uploaded_file.read())
+         tmp_path = tmp.name
+
+     with st.spinner("Processing document..."):
+         docs_text, sections = load_document(tmp_path, return_sections=True)
+
+     # Optional figure extraction (PDF only)
+     figures_meta = []
+     extra_docs = []
+     if uploaded_file.type == "application/pdf":
+         try:
+             figures_meta = extract_figures(tmp_path, out_dir="figures", lang="eng") or []
+             if figures_meta:
+                 sections["Figures (OCR+captions)"] = "\n\n".join(
+                     [
+                         f"p.{f['page']} — {f.get('caption') or '(no caption)'}\n"
+                         f"{(f.get('ocr_text') or '')[:200]}"
+                         for f in figures_meta
+                     ]
+                 )
+                 # vectorizable figure docs
+                 for f in figures_meta:
+                     content = (
+                         f"FIGURE p.{f['page']}: {f.get('caption') or ''}\n"
+                         f"OCR: {f.get('ocr_text') or ''}\n"
+                         f"TAGS: {' '.join(f.get('tags', []))}"
+                     ).strip()
+                     metadata = {
+                         "type": "figure",
+                         "page": f.get("page"),
+                         "path": f.get("path"),
+                         "caption": f.get("caption"),
+                         "tags": f.get("tags", []),
+                     }
+                     extra_docs.append({"content": content, "metadata": metadata})
+         except Exception as e:
+             st.warning(f"Figure extraction skipped: {e}")
+
+     # Build vector index (now indexes figures too)
+     if docs_text.strip():
+         db = build_rag_pipeline(docs_text, extra_docs=extra_docs or None)
+         st.success(f"✅ Document indexed! (Domain: {domain})")
+
+         # Show extracted sections
+         with st.expander("📂 Extracted Document Content"):
+             tab_names = list(sections.keys())
+             tabs = st.tabs(tab_names)
+             for i, name in enumerate(tab_names):
+                 with tabs[i]:
+                     st.text_area(f"{name}", sections[name], height=230)
+
+         # Actions (Summarize or Q&A)
+         answer_text, retrieved_docs, latency = None, None, None
+         colA, colB = st.columns([1, 1])
+
+         with colA:
+             if summarize:
+                 if st.button("📝 Summarize Document"):
+                     start = time.time()
+                     q = f"[Domain: {domain}] Summarize this document briefly with key points and numbers."
+                     answer_text, _, retrieved_docs = query_rag_full(
+                         db, q, domain=domain
+                     )
+                     latency = round(time.time() - start, 3)
+             else:
+                 if query and st.button("💡 Get Answer"):
+                     start = time.time()
+                     q = f"[Domain: {domain}] {query}"
+                     answer_text, _, retrieved_docs = query_rag_full(
+                         db, q, domain=domain
+                     )
+                     latency = round(time.time() - start, 3)
+
+         # Show results
+         if answer_text is not None:
+             st.subheader("💡 Answer")
+             st.write(answer_text)
+             st.caption(f"⏱️ Latency: {latency}s")
+
+             with st.expander("🔍 Retrieved Contexts"):
+                 if retrieved_docs:
+                     for i, d in enumerate(retrieved_docs, 1):
+                         meta = getattr(d, "metadata", {}) or {}
+                         if meta.get("type") == "figure" and meta.get("path"):
+                             st.write(
+                                 f"Figure (page {meta.get('page')}): "
+                                 f"{meta.get('caption') or '(no caption)'}"
+                             )
+                             st.image(meta["path"], use_container_width=True)
+                         else:
+                             st.info(f"Chunk {i}:\n\n{d.page_content}")
+
+         # Evaluation
+         st.markdown("---")
+         st.subheader("📊 Evaluation")
+         if st.button("Evaluate (LLM-based: Faithfulness & Relevancy)"):
+             try:
+                 raw = evaluate_rag(
+                     answer_text,
+                     [d.page_content for d in (retrieved_docs or [])],
+                     f"[Domain: {domain}] {query or 'Summary'}",
+                 )
+                 try:
+                     payload = json.loads(raw) if isinstance(raw, str) else raw
+                 except Exception:
+                     payload = {"raw": raw}
+                 st.json(payload)
+                 st.download_button(
+                     "⬇️ Download Evaluation JSON",
+                     data=json.dumps(payload, indent=2),
+                     file_name="evaluation.json",
+                     mime="application/json",
+                 )
+             except Exception as e:
+                 st.warning(f"Evaluation unavailable: {e}")
+     else:
+         st.error("❌ No text could be extracted from the uploaded file.")
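
For quick local testing, the same flow can be driven without Streamlit. The sketch below is only illustrative: it assumes the doc_loader module that app.py imports (not part of this commit) is importable, and "sample.pdf" is a placeholder path.

# headless_demo.py: illustrative sketch; doc_loader is assumed to exist and
# "sample.pdf" is a placeholder path, not part of this commit.
from doc_loader import load_document
from rag_pipeline import build_rag_pipeline, query_rag_full, evaluate_rag

docs_text, sections = load_document("sample.pdf", return_sections=True)

if docs_text.strip():
    db = build_rag_pipeline(docs_text)  # extra_docs (figure chunks) are optional
    question = "[Domain: Finance] What was total revenue?"
    answer, contexts, retrieved = query_rag_full(db, question, domain="Finance")
    print(answer)

    # LLM-based scoring; returns an explanatory dict when no OPENAI_API_KEY is set.
    print(evaluate_rag(answer, contexts, question))
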
rag_pipeline.py ADDED
@@ -0,0 +1,272 @@
+ # rag_pipeline.py
+ import os
+ from pathlib import Path
+ from typing import List, Tuple, Optional
+ from langchain_community.vectorstores import DocArrayInMemorySearch
+
+
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from langchain.schema import Document
+
+ # Prefer FAISS if available; otherwise use a pure-Python fallback (no native build)
+ try:
+     from langchain_community.vectorstores import FAISS
+     _HAS_FAISS = True
+ except Exception:
+     _HAS_FAISS = False
+
+ # Embeddings + LLM
+ try:
+     from langchain_openai import OpenAIEmbeddings, ChatOpenAI
+     _HAS_OPENAI = True
+ except Exception:
+     _HAS_OPENAI = False
+
+ try:
+     from langchain_community.embeddings import HuggingFaceEmbeddings
+     _HAS_HF = True
+ except Exception:
+     _HAS_HF = False
+
+ # Optional cross-encoder re-ranker
+ try:
+     from sentence_transformers import CrossEncoder
+     _HAS_RERANK = True
+ except Exception:
+     _HAS_RERANK = False
+
+
+ def _has_openai_key() -> bool:
+     return bool(os.getenv("OPENAI_API_KEY"))
+
+
+ def _hf_offline() -> bool:
+     # Honor explicit offline flags: either env var set to anything other than "", "0", "false".
+     return str(os.getenv("HF_HUB_OFFLINE", "")).strip() not in ("", "0", "false") or \
+         str(os.getenv("TRANSFORMERS_OFFLINE", "")).strip() not in ("", "0", "false")
+
+
+ def _resolve_local_dir(env_var: str, default_subdir: str) -> Optional[str]:
+     """
+     Return an absolute path to a local model dir if it exists, else None.
+     env_var (e.g., EMB_LOCAL_DIR / RERANK_LOCAL_DIR) takes priority.
+     default_subdir is relative to this file's directory (e.g., 'models/<name>').
+     """
+     # 1) explicit env
+     p = os.getenv(env_var)
+     if p and Path(p).is_dir():
+         return str(Path(p).resolve())
+
+     # 2) project-relative
+     here = Path(__file__).parent
+     candidate = here / default_subdir
+     if candidate.is_dir():
+         return str(candidate.resolve())
+     return None
+
+
+ def _get_embeddings():
+     """
+     Prefer OpenAI embeddings if available; otherwise use HuggingFace with a local directory if present.
+     If offline and no local dir, raise a friendly error.
+     """
+     if _HAS_OPENAI and _has_openai_key():
+         return OpenAIEmbeddings(model="text-embedding-3-small")
+
+     if not _HAS_HF:
+         raise RuntimeError(
+             "No embeddings backend available. Install `langchain-openai` (and set OPENAI_API_KEY) "
+             "or install `langchain_community` + `sentence-transformers`."
+         )
+
+     # Try local first
+     local_dir = _resolve_local_dir(
+         env_var="EMB_LOCAL_DIR",
+         default_subdir="models/paraphrase-MiniLM-L6-v2",
+     )
+     if local_dir:
+         return HuggingFaceEmbeddings(model_name=local_dir)
+
+     # No local dir; fall back to the hub name only if not offline
+     if _hf_offline():
+         raise RuntimeError(
+             "HF offline mode is enabled and no local embedding model was found.\n"
+             "Set EMB_LOCAL_DIR to a downloaded folder, or place the model at "
+             "<repo_root>/models/paraphrase-MiniLM-L6-v2.\n"
+             "Example download (online machine):\n"
+             "  hf download sentence-transformers/paraphrase-MiniLM-L6-v2 "
+             "--local-dir models/paraphrase-MiniLM-L6-v2"
+         )
+     return HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-MiniLM-L6-v2")
+
+
+ def _get_llm(model_name: str = "gpt-4o-mini", temperature: float = 0):
+     if _HAS_OPENAI and _has_openai_key():
+         return ChatOpenAI(model=model_name, temperature=temperature)
+     return None
+
+
+ def build_rag_pipeline(
+     docs: str,
+     extra_docs: Optional[List[dict]] = None,
+     chunk_size: int = 1000,
+     chunk_overlap: int = 120,
+ ):
+     """
+     Build a vector index with metadata-aware Documents.
+     Falls back to DocArrayInMemorySearch when FAISS isn't available.
+     - docs: merged plain text from loader
+     - extra_docs: list of {"content": str, "metadata": dict} for figures, etc.
+     """
+     splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+     text_chunks = splitter.split_text(docs) if docs else []
+
+     documents: List[Document] = []
+     for i, ch in enumerate(text_chunks):
+         documents.append(
+             Document(page_content=ch, metadata={"type": "text", "chunk_id": i})
+         )
+
+     if extra_docs:
+         for ed in extra_docs:
+             documents.append(
+                 Document(
+                     page_content=(ed.get("content") or "").strip(),
+                     metadata={**(ed.get("metadata") or {}), "type": (ed.get("metadata") or {}).get("type", "extra")},
+                 )
+             )
+
+     if not documents:
+         raise ValueError("No content to index.")
+
+     embeddings = _get_embeddings()
+     if _HAS_FAISS:
+         return FAISS.from_documents(documents, embeddings)
+     return DocArrayInMemorySearch.from_documents(documents, embeddings)
+
+
+ def _domain_prompt(domain: str) -> str:
+     base = (
+         "You are an AI assistant specialized in {domain}. "
+         "Answer strictly using the provided context (including tables/figures). "
+         "Provide clear numbers and cite section/table/figure if possible. "
+         'If the answer is not in the context, reply exactly: "I don\'t have enough information from the document."'
+     )
+     return base.format(domain=domain)
+
+
+ def query_rag_full(
+     db,
+     query: str,
+     top_k: int = 12,
+     rerank_keep: int = 5,
+     domain: str = "Finance",
+ ) -> Tuple[str, List[str], List[Document]]:
+     """
+     Returns (answer_text, retrieved_texts, retrieved_docs)
+     - Retrieves Documents with metadata
+     - Optional cross-encoder re-ranking (local-only if offline)
+     - LLM synthesis if available, else stitched fallback
+     """
+     retriever = db.as_retriever(search_kwargs={"k": top_k})
+     retrieved_docs: List[Document] = retriever.get_relevant_documents(query) or []
+
+     # Optional re-rank
+     top_docs = retrieved_docs
+     if _HAS_RERANK and retrieved_docs:
+         rerank_local = _resolve_local_dir("RERANK_LOCAL_DIR", "models/msmarco-MiniLM-L-6-v2")
+         try:
+             if rerank_local:
+                 model = CrossEncoder(rerank_local)
+             elif not _hf_offline():
+                 model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
+             else:
+                 model = None  # offline + no local reranker → skip
+         except Exception:
+             model = None
+
+         if model is not None:
+             pairs = [[query, d.page_content] for d in retrieved_docs]
+             scores = model.predict(pairs)
+             idx_sorted = sorted(range(len(scores)), key=lambda i: float(scores[i]), reverse=True)
+             keep = max(1, min(rerank_keep, len(idx_sorted)))
+             top_docs = [retrieved_docs[i] for i in idx_sorted[:keep]]
+
+     retrieved_texts = [d.page_content for d in top_docs]
+     if not retrieved_texts:
+         return "I couldn't find anything relevant in the document.", [], []
+
+     llm = _get_llm()
+     sys = _domain_prompt(domain)
+     if llm:
+         context = "\n\n".join([f"[{i+1}] {d.page_content[:4000]}" for i, d in enumerate(top_docs)])
+         cite_hints = []
+         for i, d in enumerate(top_docs):
+             m = d.metadata or {}
+             if m.get("type") == "figure" and m.get("page"):
+                 cite_hints.append(f"[{i+1}] Figure p.{m.get('page')}")
+             elif m.get("type") == "text":
+                 cite_hints.append(f"[{i+1}] Text chunk {m.get('chunk_id')}")
+         hints = "; ".join(cite_hints)
+
+         prompt = f"""{sys}
+
+ Context:
+ {context}
+
+ Hints for citations: {hints}
+
+ Question: {query}
+
+ Answer (include brief citations like [1] or 'Figure p.X' when appropriate):"""
+         answer = llm.invoke(prompt).content.strip()
+         return answer, retrieved_texts, top_docs
+
+     # Offline fallback
+     stitched = " ".join(retrieved_texts)[:1500]
+     answer = f"Answer (from retrieved context): {stitched}"
+     return answer, retrieved_texts, top_docs
+
+
+ def query_rag(db, query: str, top_k: int = 4) -> Tuple[str, List[str]]:
+     ans, texts, _docs = query_rag_full(db, query, top_k=top_k)
+     return ans, texts
+
+
+ def evaluate_rag(answer: str, retrieved_docs: List[str], query: str):
+     llm = _get_llm(model_name="gpt-4o-mini", temperature=0)
+     if not llm:
+         return {
+             "faithfulness": None,
+             "relevancy": None,
+             "explanation": (
+                 "Evaluation requires an LLM (OpenAI). Set OPENAI_API_KEY and install `langchain-openai`."
+             ),
+             "mode": "offline-fallback",
+         }
+     docs_text = "\n".join(retrieved_docs)
+     eval_prompt = f"""
+ You are an impartial evaluator. Given a question, an assistant's answer, and the retrieved context,
+ score the response on:
+
+ 1) Faithfulness (0-5): Is every claim supported by the retrieved context?
+ 2) Relevancy (0-5): Do the retrieved docs directly address the question?
+
+ Return STRICT JSON ONLY:
+ {{
+   "faithfulness": <0-5 integer>,
+   "relevancy": <0-5 integer>,
+   "explanation": "<one-sentence reason>"
+ }}
+
+ ---
+ Question: {query}
+
+ Retrieved Context:
+ {docs_text}
+
+ Answer:
+ {answer}
+ """
+     raw = llm.invoke(eval_prompt).content.strip()
+     return raw
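
As a sanity check for the offline path above, the sketch below builds an index and queries it with no OPENAI_API_KEY set, so query_rag returns the stitched-context fallback rather than an LLM answer. The EMB_LOCAL_DIR value is only an example of where a pre-downloaded sentence-transformers folder might live.

import os
from rag_pipeline import build_rag_pipeline, query_rag

# Example only: point at a pre-downloaded sentence-transformers folder so
# _get_embeddings() can run fully offline (see _resolve_local_dir above).
os.environ["EMB_LOCAL_DIR"] = "models/paraphrase-MiniLM-L6-v2"
os.environ["HF_HUB_OFFLINE"] = "1"

text = "Revenue in 2023 was $1.2M. Revenue in 2024 was $1.5M."
db = build_rag_pipeline(text, chunk_size=200, chunk_overlap=20)

# Without an OpenAI key this returns the "Answer (from retrieved context): ..." fallback.
answer, contexts = query_rag(db, "What was revenue in 2024?", top_k=2)
print(answer)
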
requirements.txt ADDED
@@ -0,0 +1,17 @@
+ streamlit
+ langchain
+ langchain-openai
+ langchain-community
+ langchain-text-splitters
+ faiss-cpu
+ pymupdf
+ pdfplumber
+ camelot-py
+ opencv-python
+ pillow
+ pytesseract
+ sentence-transformers
+ huggingface_hub
+ ragas
+ datasets
+ evaluate
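
Several of these packages back optional code paths (FAISS, the OpenAI bindings, the cross-encoder re-ranker), and rag_pipeline.py degrades gracefully when they are missing. A small, hypothetical check script that mirrors its try/except import probes can confirm which optional pieces resolve in a given environment:

# env_check.py: hypothetical helper mirroring the import probes in rag_pipeline.py.
def probe(label, module_name):
    try:
        __import__(module_name)
        print(f"[ok]   {label}")
    except Exception as exc:
        print(f"[miss] {label}: {exc.__class__.__name__}")

probe("FAISS (faiss-cpu)", "faiss")
probe("OpenAI bindings (langchain-openai)", "langchain_openai")
probe("HF embeddings / re-ranker (sentence-transformers)", "sentence_transformers")
probe("Streamlit UI", "streamlit")
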