Praneetha N
committed on
Commit
·
1e5cf81
1
Parent(s):
e05fa86
Add supporting scripts for RAG pipeline
Browse files- dataset_eval.py +81 -0
- doc_loader.py +151 -0
- figure_extractor.py +94 -0
- ragas_eval.py +60 -0
- utils.py +213 -0
dataset_eval.py
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# dataset_eval.py
"""Batch RAGAS evaluation over every PDF / question-set pair in DATA_DIR."""
import os, json, glob, time
from datetime import datetime
import pandas as pd
from datasets import Dataset
from ragas import evaluate
# FIX: ragas exports the metric as `answer_relevancy`; importing
# `answer_relevance` raises ImportError. Alias it so existing call
# sites in this file keep working unchanged.
from ragas.metrics import faithfulness, context_recall
from ragas.metrics import answer_relevancy as answer_relevance

from doc_loader import load_document
from rag_pipeline import build_rag_pipeline, query_rag_full

DATA_DIR = "datasets/finance"  # holds <name>.pdf + <name>.questions.json pairs
OUTPUT = "eval_runs"           # destination for aggregate CSVs
os.makedirs(OUTPUT, exist_ok=True)
|
15 |
+
|
16 |
+
def load_questions(q_path):
    """Load evaluation questions (and optional ground truths) from a JSON file.

    Expects ``{"questions": [...], "ground_truth": [...]}``. When ground
    truths are absent, empty strings are substituted so both lists always
    have matching lengths (ragas requires aligned columns).
    """
    # FIX: read with an explicit encoding so non-ASCII questions load
    # identically across platforms (Windows defaults to cp1252).
    with open(q_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    questions = data["questions"]
    return questions, data.get("ground_truth", [""] * len(questions))
|
20 |
+
|
21 |
+
def eval_file(pdf_path, q_path, domain="Finance"):
    """Evaluate one PDF end-to-end: build the RAG index, answer every
    question, and score the answers with RAGAS.

    Returns (questions, answers, ground_truths, records) where each record
    is a per-question dict of metric scores plus a "latency_s" field.
    """
    docs, _ = load_document(pdf_path, return_sections=True)
    db = build_rag_pipeline(docs)
    questions, golds = load_questions(q_path)

    answers, contexts, latencies = [], [], []
    for q in questions:
        t0 = time.time()
        ans, ctxs, _docs = query_rag_full(db, q, domain=domain)
        latencies.append(round(time.time() - t0, 3))
        answers.append(ans)
        contexts.append([c for c in ctxs])  # ragas expects List[List[str]]

    ds = Dataset.from_dict({
        "question": questions,
        "contexts": contexts,
        "answer": answers,
        "ground_truth": golds,
    })
    scores = evaluate(ds, metrics=[faithfulness, answer_relevance, context_recall])
    try:
        # Per-question rows when the result object supports to_pandas().
        recs = scores.to_pandas().to_dict(orient="records")
    except Exception:
        # Fallback: a single aggregate record (older ragas versions return a dict).
        recs = [scores] if isinstance(scores, dict) else [{"scores_raw": str(scores)}]

    # Attach latency per record; guarded index because the fallback path may
    # yield fewer records than questions.
    for i, r in enumerate(recs):
        r["latency_s"] = latencies[i] if i < len(latencies) else None

    return questions, answers, golds, recs
|
50 |
+
|
51 |
+
def main():
    """Run eval_file over every PDF in DATA_DIR that has a matching
    <name>.questions.json, then write one aggregate CSV of all score rows."""
    rows = []
    for pdf_path in glob.glob(os.path.join(DATA_DIR, "*.pdf")):
        base = os.path.splitext(os.path.basename(pdf_path))[0]
        # Question sets are paired with PDFs by filename convention.
        q_path = os.path.join(DATA_DIR, f"{base}.questions.json")
        if not os.path.exists(q_path):
            print(f"[WARN] Missing questions for {pdf_path} (expected {q_path}) — skipping.")
            continue
        print(f"[INFO] Evaluating {base} ...")
        qs, ans, gold, recs = eval_file(pdf_path, q_path)
        # Attach identifying columns so rows from different files can be told apart.
        for i, r in enumerate(recs):
            r.update({
                "file": base,
                "question": qs[i],
                "answer": ans[i],
                "ground_truth": gold[i],
            })
            rows.append(r)

    if not rows:
        print("[INFO] No evaluations produced.")
        return

    df = pd.DataFrame.from_records(rows)
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")  # timestamped output name
    out_csv = os.path.join(OUTPUT, f"aggregate_{stamp}.csv")
    df.to_csv(out_csv, index=False)
    print(f"[OK] Saved aggregate CSV → {out_csv}")

if __name__ == "__main__":
    main()
|
doc_loader.py
ADDED
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# doc_loader.py
|
2 |
+
import os
|
3 |
+
import io
|
4 |
+
from typing import Dict, Tuple
|
5 |
+
|
6 |
+
import fitz # PyMuPDF
|
7 |
+
import pdfplumber
|
8 |
+
import camelot
|
9 |
+
import pytesseract
|
10 |
+
from PIL import Image
|
11 |
+
|
12 |
+
|
13 |
+
def _extract_text_digital_pdf(path: str) -> str:
    """Extract embedded (digital) text from a PDF via PyMuPDF.

    Returns page texts joined by blank lines, or "" on any failure so the
    caller can fall back to OCR.
    """
    parts = []
    try:
        with fitz.open(path) as pdf:
            for page in pdf:
                txt = page.get_text("text") or ""
                if txt.strip():
                    parts.append(txt.strip())
    except Exception:
        # Corrupt/unreadable PDF → signal "no digital text" rather than crash.
        return ""
    return "\n\n".join(parts).strip()
|
24 |
+
|
25 |
+
|
26 |
+
def _extract_text_scanned_pdf(path: str, zoom: float = 2.0, lang: str = "eng") -> str:
    """Rasterize each page and OCR it with Tesseract (fallback for scanned PDFs).

    `zoom` upscales the render before OCR (higher zoom ≈ higher effective DPI,
    better recognition at the cost of speed). Returns "" on any failure.
    """
    out = []
    try:
        with fitz.open(path) as pdf:
            mat = fitz.Matrix(zoom, zoom)  # scale matrix applied to every page render
            for page in pdf:
                pix = page.get_pixmap(matrix=mat)
                img = Image.open(io.BytesIO(pix.tobytes("png")))
                ocr_text = pytesseract.image_to_string(img, lang=lang) or ""
                if ocr_text.strip():
                    out.append(ocr_text.strip())
    except Exception:
        return ""
    return "\n\n".join(out).strip()
|
40 |
+
|
41 |
+
|
42 |
+
def extract_text_from_pdf(path: str, lang: str = "eng") -> str:
    """Prefer the fast digital-text path; fall back to OCR for scanned PDFs.

    A digital extraction shorter than 200 characters is treated as a sign
    the PDF is a scan, triggering the OCR path.
    """
    min_digital_chars = 200
    text = _extract_text_digital_pdf(path)
    if len(text) < min_digital_chars:
        text = _extract_text_scanned_pdf(path, zoom=2.0, lang=lang)
    return text
|
47 |
+
|
48 |
+
|
49 |
+
def extract_tables_from_pdf(path: str) -> str:
    """Extract tables as plain text: Camelot first (lattice, then stream
    flavor), falling back to pdfplumber. Returns "" when nothing is found."""
    # Camelot first (lattice → stream)
    try:
        blocks = []
        for flavor in ("lattice", "stream"):
            try:
                tbs = camelot.read_pdf(path, pages="all", flavor=flavor)
                if tbs and len(tbs) > 0:
                    for i, tb in enumerate(tbs):
                        rows = [" | ".join(map(str, row)) for row in tb.df.values.tolist()]
                        blocks.append(f"Table ({flavor}) {i+1}:\n" + "\n".join(rows))
                    break  # stop at the first flavor that finds tables
            except Exception:
                continue  # this flavor failed; try the next one
        if blocks:
            return "\n\n".join(blocks).strip()
    except Exception:
        pass

    # Fallback to pdfplumber
    try:
        blocks = []
        with pdfplumber.open(path) as pdf:
            for pageno, page in enumerate(pdf.pages, start=1):
                tables = page.extract_tables() or []
                for t_i, table in enumerate(tables, start=1):
                    # pdfplumber cells may be None; render them as empty strings.
                    rows = [" | ".join([c if c is not None else "" for c in row]) for row in table]
                    blocks.append(f"Table (plumber) p.{pageno} #{t_i}:\n" + "\n".join(rows))
        return "\n\n".join(blocks).strip()
    except Exception:
        return ""
|
80 |
+
|
81 |
+
|
82 |
+
def extract_layout_text_pdf(path: str) -> str:
    """Layout-aware page text via pdfplumber tolerances; "" on any failure."""
    try:
        with pdfplumber.open(path) as pdf:
            page_texts = [
                (pg.extract_text(x_tolerance=2, y_tolerance=2) or "").strip()
                for pg in pdf.pages
            ]
        return "\n\n".join(t for t in page_texts if t).strip()
    except Exception:
        return ""
|
93 |
+
|
94 |
+
|
95 |
+
def extract_figures_and_captions_pdf(path: str) -> str:
    """List bitmap figures per page with their bbox and a caption read from a
    text band just below each image. Returns a plain-text summary, "" on failure."""
    try:
        out = []
        with pdfplumber.open(path) as pdf:
            for pageno, page in enumerate(pdf.pages, start=1):
                images = page.images or []
                for idx, im in enumerate(images, start=1):
                    x0 = float(im.get("x0", 0))
                    y0 = float(im.get("top", 0))
                    x1 = float(im.get("x1", 0))
                    y1 = float(im.get("bottom", 0))
                    ph = float(page.height)
                    # Caption band: a ~55pt strip directly under the image,
                    # clamped so it never extends past the page bottom.
                    band_top = min(y1 + 5, ph)
                    band_bottom = min(y1 + 60, ph)
                    cap = ""
                    # FIX: guard against a zero-height band when the image touches
                    # the page bottom (both clamps collapse to ph). This mirrors
                    # the identical guard in figure_extractor.extract_figures.
                    if band_bottom > band_top:
                        try:
                            band = page.within_bbox((x0, band_top, x1, band_bottom))
                            cap = (band.extract_text() or "").strip()
                        except Exception:
                            pass
                    out.append(f"Figure p.{pageno} #{idx} bbox=({int(x0)},{int(y0)},{int(x1)},{int(y1)})\nCaption: {cap or 'N/A'}")
        return "\n\n".join(out).strip()
    except Exception:
        return ""
|
116 |
+
|
117 |
+
|
118 |
+
def extract_text_from_image(path: str, lang: str = "eng") -> str:
    """OCR a single image file (jpg/png) with Tesseract; "" on any failure.

    FIX: the image is opened in a context manager so the underlying file
    handle is always released (PIL keeps lazily-loaded files open otherwise).
    """
    try:
        with Image.open(path) as img:
            return (pytesseract.image_to_string(img, lang=lang) or "").strip()
    except Exception:
        return ""
|
124 |
+
|
125 |
+
|
126 |
+
def load_document(path: str, return_sections: bool = False, lang: str = "eng"):
    """Load a PDF or image into plain text suitable for indexing.

    For PDFs, builds named sections ("Text", "Tables", "Layout", "Figures");
    for images, a single "OCR" section. Empty sections are dropped from the
    merged blob.

    Returns the merged text, or (merged, sections) when return_sections=True.
    Raises ValueError for unsupported extensions.
    """
    ext = os.path.splitext(path)[-1].lower()
    sections: Dict[str, str] = {}

    if ext == ".pdf":
        sections["Text"] = extract_text_from_pdf(path, lang=lang)
        tbl = extract_tables_from_pdf(path)
        if tbl:
            sections["Tables"] = tbl
        layout = extract_layout_text_pdf(path)
        if layout:
            sections["Layout"] = layout
        figs = extract_figures_and_captions_pdf(path)
        if figs:
            sections["Figures"] = figs

    elif ext in [".jpg", ".jpeg", ".png"]:
        sections["OCR"] = extract_text_from_image(path, lang=lang)

    else:
        raise ValueError(f"Unsupported file type: {ext}")

    # Merge non-empty sections into one blob, double-newline separated.
    merged = "\n\n".join([v for v in sections.values() if v and v.strip()]).strip()
    if return_sections:
        return merged, sections
    return merged
|
figure_extractor.py
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# figure_extractor.py
|
2 |
+
# figure_extractor.py
|
3 |
+
import os
|
4 |
+
import io
|
5 |
+
from typing import List, Dict, Any
|
6 |
+
|
7 |
+
import fitz # PyMuPDF
|
8 |
+
import pdfplumber
|
9 |
+
import pytesseract
|
10 |
+
from PIL import Image
|
11 |
+
|
12 |
+
|
13 |
+
def extract_figures(pdf_path: str, out_dir: str = "figures", lang: str = "eng") -> List[Dict[str, Any]]:
    """
    Detect bitmap figures in a PDF, crop them, OCR content, and read a caption band.
    Returns a list with metadata used for indexing + UI previews.

    Each record: {"page", "bbox", "path", "caption", "ocr_text", "tags"}.
    """
    os.makedirs(out_dir, exist_ok=True)
    results: List[Dict[str, Any]] = []

    with pdfplumber.open(pdf_path) as pl_doc:
        # FIX: close the PyMuPDF document deterministically; it was previously
        # never closed (resource leak on long-running processes).
        fz_doc = fitz.open(pdf_path)
        try:
            for pageno in range(len(fz_doc)):
                pl_page = pl_doc.pages[pageno]
                fz_page = fz_doc[pageno]

                images = pl_page.images or []
                if not images:
                    continue

                page_h = float(pl_page.height)
                page_w = float(pl_page.width)

                for idx, im in enumerate(images, start=1):
                    x0 = float(im.get("x0", 0))
                    y0 = float(im.get("top", 0))
                    x1 = float(im.get("x1", 0))
                    y1 = float(im.get("bottom", 0))

                    w = max(1.0, x1 - x0)
                    h = max(1.0, y1 - y0)
                    area = w * h
                    page_area = page_w * page_h
                    if area < 10000 or area < 0.01 * page_area:
                        continue  # skip tiny artifacts/icons

                    # Crop the figure region and save it as a PNG preview.
                    try:
                        rect = fitz.Rect(x0, y0, x1, y1)
                        pix = fz_page.get_pixmap(clip=rect, alpha=False)
                        img_path = os.path.join(out_dir, f"page_{pageno+1}_fig_{idx}.png")
                        pix.save(img_path)
                    except Exception:
                        continue

                    # OCR the cropped image. FIX: open the saved PNG directly in a
                    # context manager instead of the redundant open()+BytesIO
                    # round-trip, which also leaked the PIL file handle.
                    ocr_text = ""
                    try:
                        with Image.open(img_path) as img:
                            ocr_text = (pytesseract.image_to_string(img, lang=lang) or "").strip()
                    except Exception:
                        pass

                    # Caption: text in a ~55pt band just below the image, clamped to the page.
                    caption = ""
                    try:
                        band_top = min(y1 + 5, page_h)
                        band_bottom = min(y1 + 60, page_h)
                        if band_bottom > band_top:
                            band = pl_page.within_bbox((x0, band_top, x1, band_bottom))
                            caption = (band.extract_text() or "").strip()
                    except Exception:
                        pass

                    # Cheap keyword tagging used downstream for filtering.
                    tags = ["figure"]
                    low = (ocr_text + " " + caption).lower()
                    if any(k in low for k in ["chart", "graph", "trend", "bar", "line", "pie"]):
                        tags.append("chart")
                    if any(k in low for k in [
                        "revenue", "profit", "income", "eps", "cash flow",
                        "operating", "balance", "assets", "liabilities", "equity",
                        "ratio", "margin", "ebit", "ebitda"
                    ]):
                        tags.append("finance")

                    results.append({
                        "page": pageno + 1,
                        "bbox": (int(x0), int(y0), int(x1), int(y1)),
                        "path": img_path,
                        "caption": caption,
                        "ocr_text": ocr_text,
                        "tags": tags,
                    })
        finally:
            fz_doc.close()

    return results
|
ragas_eval.py
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# ragas_eval.py
# Convenience alias to run a single sample quickly.
import os, json, time
from datetime import datetime
import pandas as pd
from datasets import Dataset
from ragas import evaluate
# FIX: ragas exports the metric as `answer_relevancy`; importing
# `answer_relevance` raises ImportError. Alias it so the call in run()
# keeps working unchanged.
from ragas.metrics import faithfulness, context_recall
from ragas.metrics import answer_relevancy as answer_relevance

from doc_loader import load_document
from rag_pipeline import build_rag_pipeline, query_rag_full

DOC_PATH = "samples/finance_report.pdf"  # single document used for the smoke test
OUTPUT_DIR = "eval_runs"                 # destination for timestamped JSON results
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
16 |
+
|
17 |
+
def run():
    """One-off RAGAS smoke test against DOC_PATH with three canned questions;
    saves answers, latencies, and scores to a timestamped JSON in OUTPUT_DIR."""
    docs, sections = load_document(DOC_PATH, return_sections=True)
    db = build_rag_pipeline(docs)

    questions = [
        "What was the company’s net profit in 2022?",
        "What is the EPS reported for Q3 2023?",
        "Summarize the auditor’s opinion in one sentence.",
    ]
    gold = ["", "", ""]  # fill if known

    answers, contexts, latencies = [], [], []
    for q in questions:
        t0 = time.time()
        ans, ctxs, _ = query_rag_full(db, q, domain="Finance")
        lat = round(time.time() - t0, 3)
        answers.append(ans)
        contexts.append(ctxs)
        latencies.append(lat)

    ds = Dataset.from_dict({
        "question": questions,
        "contexts": [list(c) for c in contexts],  # ragas expects List[List[str]]
        "answer": answers,
        "ground_truth": gold,
    })
    scores = evaluate(ds, metrics=[faithfulness, answer_relevance, context_recall])

    # persist
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    out_json = os.path.join(OUTPUT_DIR, f"ragas_{stamp}.json")
    with open(out_json, "w") as f:
        json.dump({
            "doc_path": DOC_PATH,
            "questions": questions,
            "answers": answers,
            "latencies": latencies,
            # to_dict() when the result object supports it; str() fallback otherwise
            "scores": getattr(scores, "to_dict", lambda: str(scores))(),
        }, f, indent=2)

    print(f"[OK] Saved → {out_json}")

if __name__ == "__main__":
    run()
|
utils.py
ADDED
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# utils.py
|
2 |
+
from __future__ import annotations
|
3 |
+
|
4 |
+
import io
|
5 |
+
import os
|
6 |
+
import tempfile
|
7 |
+
from typing import Tuple, Optional, Union
|
8 |
+
|
9 |
+
import cv2
|
10 |
+
import numpy as np
|
11 |
+
import pytesseract
|
12 |
+
from PIL import Image
|
13 |
+
from PyPDF2 import PdfReader
|
14 |
+
|
15 |
+
# -------------------------------
|
16 |
+
# Small helpers
|
17 |
+
# -------------------------------
|
18 |
+
|
19 |
+
def _to_path(file_or_path: Union[str, bytes, os.PathLike, io.BufferedIOBase]) -> Tuple[str, Optional[str]]:
|
20 |
+
"""
|
21 |
+
Ensure we have a filesystem path. If a file-like is provided, write it to a temp file.
|
22 |
+
Returns (path, tmp_path); tmp_path is None if no temp file was created.
|
23 |
+
"""
|
24 |
+
if isinstance(file_or_path, (str, bytes, os.PathLike)):
|
25 |
+
return str(file_or_path), None
|
26 |
+
# file-like → persist to a temp file
|
27 |
+
suffix = ""
|
28 |
+
try:
|
29 |
+
name = getattr(file_or_path, "name", "")
|
30 |
+
if isinstance(name, str) and "." in name:
|
31 |
+
suffix = f".{name.rsplit('.', 1)[-1]}"
|
32 |
+
except Exception:
|
33 |
+
pass
|
34 |
+
tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
|
35 |
+
try:
|
36 |
+
# rewind if possible
|
37 |
+
if hasattr(file_or_path, "seek"):
|
38 |
+
try: file_or_path.seek(0)
|
39 |
+
except Exception: pass
|
40 |
+
tmp.write(file_or_path.read())
|
41 |
+
finally:
|
42 |
+
tmp.flush()
|
43 |
+
tmp.close()
|
44 |
+
return tmp.name, tmp.name
|
45 |
+
|
46 |
+
|
47 |
+
def _cleanup_tmp(tmp_path: Optional[str]) -> None:
|
48 |
+
if tmp_path and os.path.exists(tmp_path):
|
49 |
+
try:
|
50 |
+
os.remove(tmp_path)
|
51 |
+
except Exception:
|
52 |
+
pass
|
53 |
+
|
54 |
+
|
55 |
+
# -------------------------------
|
56 |
+
# PDF Text Extraction (PyPDF2 fast path)
|
57 |
+
# -------------------------------
|
58 |
+
|
59 |
+
def extract_text_from_pdf(file_or_path) -> str:
    """
    Extract plain text from a (digital) PDF using PyPDF2.
    Silent fallback (returns "") on failure to avoid polluting embeddings.
    Accepts a filesystem path or a file-like object (see _to_path).
    """
    path, tmp = _to_path(file_or_path)
    try:
        reader = PdfReader(path)
        pages_text = []
        for page in reader.pages:
            try:
                t = page.extract_text()
                if t:
                    pages_text.append(t)
            except Exception:
                # A single unreadable page should not sink the whole document.
                continue
        return "\n".join(pages_text).strip()
    except Exception:
        return ""
    finally:
        # Remove the temp copy _to_path may have created for file-like inputs.
        _cleanup_tmp(tmp)
|
80 |
+
|
81 |
+
|
82 |
+
# -------------------------------
|
83 |
+
# Image OCR
|
84 |
+
# -------------------------------
|
85 |
+
|
86 |
+
def extract_text_from_image(file_or_path, lang: str = "eng") -> str:
    """
    Basic OCR on an image (jpg/png). Handles file path or file-like.
    Returns "" on any failure; the temp file _to_path may create is always removed.

    FIX: the image is opened in a context manager so PIL's lazily-held file
    handle is released before the temp file is deleted (required on Windows).
    """
    path, tmp = _to_path(file_or_path)
    try:
        with Image.open(path) as img:
            return (pytesseract.image_to_string(img, lang=lang) or "").strip()
    except Exception:
        return ""
    finally:
        _cleanup_tmp(tmp)
|
98 |
+
|
99 |
+
|
100 |
+
# -------------------------------
|
101 |
+
# Tables from PDF (Camelot stream)
|
102 |
+
# -------------------------------
|
103 |
+
|
104 |
+
def extract_tables_from_pdf(file_path: str) -> str:
    """
    Extract tables using Camelot (stream flavor).
    Returns a single plain-text block, or "" when nothing could be extracted.
    """
    try:
        import camelot
        found = camelot.read_pdf(file_path, pages="all", flavor="stream")
        chunks = []
        for n, table in enumerate(found, start=1):
            body = "\n".join(
                " | ".join(str(cell) for cell in row)
                for row in table.df.values.tolist()
            )
            chunks.append(f"Table {n}:\n{body}")
        if not chunks:
            return ""
        return "\n\n".join(chunks).strip()
    except Exception:
        return ""
|
119 |
+
|
120 |
+
|
121 |
+
# -------------------------------
|
122 |
+
# Layout-aware text (pdfplumber)
|
123 |
+
# -------------------------------
|
124 |
+
|
125 |
+
def extract_layout_text(file_path: str) -> str:
    """
    Preserve headings/paragraphs via pdfplumber tolerances.
    Returns "" on any failure.
    """
    try:
        import pdfplumber
        collected = []
        with pdfplumber.open(file_path) as pdf:
            for pg in pdf.pages:
                chunk = (pg.extract_text(x_tolerance=2, y_tolerance=2) or "").strip()
                if chunk:
                    collected.append(chunk)
        return "\n\n".join(collected).strip()
    except Exception:
        return ""
|
140 |
+
|
141 |
+
|
142 |
+
# -------------------------------
|
143 |
+
# Chart / Graph OCR (OpenCV + Tesseract)
|
144 |
+
# -------------------------------
|
145 |
+
|
146 |
+
def extract_chart_text(image_input: Union[str, np.ndarray], lang: str = "eng") -> str:
    """
    Extract textual info from charts/graphs using a robust preprocessing pipeline:
    - grayscale
    - morphological tophat (remove background)
    - adaptive threshold (handles light/dark themes)
    - median denoise
    - OCR with conservative psm
    Accepts a file path or a BGR numpy array (OpenCV).
    Returns "" on any failure.
    """
    try:
        if isinstance(image_input, str):
            img = cv2.imread(image_input)  # None when the path is unreadable
        else:
            img = image_input
        if img is None:
            return ""

        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # background suppression (tophat)
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (9, 9))
        tophat = cv2.morphologyEx(gray, cv2.MORPH_TOPHAT, kernel)

        # adaptive threshold (robust to varying backgrounds)
        thr = cv2.adaptiveThreshold(
            tophat, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY, 31, 15
        )

        # slight opening to remove specks, then median blur
        opened = cv2.morphologyEx(thr, cv2.MORPH_OPEN, np.ones((2, 2), np.uint8))
        denoised = cv2.medianBlur(opened, 3)

        # Some charts have light text on dark bg → try inverted too and pick longer text
        inverted = cv2.bitwise_not(denoised)

        cfg = "--oem 3 --psm 6"  # assume a block of text
        txt1 = pytesseract.image_to_string(denoised, lang=lang, config=config_str(cfg))
        txt2 = pytesseract.image_to_string(inverted, lang=lang, config=config_str(cfg))

        # Keep whichever polarity yielded more recognized text.
        text = (txt1 or "")
        if len((txt2 or "").strip()) > len(text.strip()):
            text = txt2

        return text.strip()
    except Exception:
        return ""
|
194 |
+
|
195 |
+
|
196 |
+
def config_str(base: str) -> str:
    """Hook point for appending extra Tesseract config flags later.

    Currently an identity function: the config passes through unchanged.
    """
    return base
|
201 |
+
|
202 |
+
|
203 |
+
# -------------------------------
|
204 |
+
# Image-Text correlation helper
|
205 |
+
# -------------------------------
|
206 |
+
|
207 |
+
def merge_image_with_caption(image_text: str, caption: str) -> str:
    """
    Combine OCR text + caption into a single blob for embedding.
    Empty/None inputs are replaced with explicit placeholders so the
    embedding never sees a blank field.
    """
    ocr_part = (image_text or "").strip()
    cap_part = (caption or "").strip()
    if not ocr_part:
        ocr_part = "No visible text"
    if not cap_part:
        cap_part = "No caption"
    return f"Image Content: {ocr_part}\nCaption: {cap_part}"
|