Praneetha N
committed on
Commit
·
1e5cf81
1
Parent(s):
e05fa86
Add supporting scripts for RAG pipeline
Browse files- dataset_eval.py +81 -0
- doc_loader.py +151 -0
- figure_extractor.py +94 -0
- ragas_eval.py +60 -0
- utils.py +213 -0
dataset_eval.py
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# dataset_eval.py
"""Batch RAGAS evaluation over every PDF / question-set pair in DATA_DIR."""
import os, json, glob, time
from datetime import datetime
import pandas as pd
from datasets import Dataset
from ragas import evaluate
# FIX: ragas exports the metric as `answer_relevancy`; importing
# `answer_relevance` raises ImportError. Alias it so existing call
# sites in this file keep working unchanged.
from ragas.metrics import faithfulness, context_recall
from ragas.metrics import answer_relevancy as answer_relevance

from doc_loader import load_document
from rag_pipeline import build_rag_pipeline, query_rag_full

DATA_DIR = "datasets/finance"  # holds <name>.pdf + <name>.questions.json pairs
OUTPUT = "eval_runs"           # destination for aggregate CSVs
os.makedirs(OUTPUT, exist_ok=True)
|
15 |
+
|
16 |
+
def load_questions(q_path):
    """Load evaluation questions (and optional ground truths) from a JSON file.

    Expects ``{"questions": [...], "ground_truth": [...]}``. When ground
    truths are absent, empty strings are substituted so both lists always
    have matching lengths (ragas requires aligned columns).
    """
    # FIX: read with an explicit encoding so non-ASCII questions load
    # identically across platforms (Windows defaults to cp1252).
    with open(q_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    questions = data["questions"]
    return questions, data.get("ground_truth", [""] * len(questions))
|
20 |
+
|
21 |
+
def eval_file(pdf_path, q_path, domain="Finance"):
    """Evaluate one PDF end-to-end: build the RAG index, answer every
    question, and score the answers with RAGAS.

    Returns (questions, answers, ground_truths, records) where each record
    is a per-question dict of metric scores plus a "latency_s" field.
    """
    docs, _ = load_document(pdf_path, return_sections=True)
    db = build_rag_pipeline(docs)
    questions, golds = load_questions(q_path)

    answers, contexts, latencies = [], [], []
    for q in questions:
        t0 = time.time()
        ans, ctxs, _docs = query_rag_full(db, q, domain=domain)
        latencies.append(round(time.time() - t0, 3))
        answers.append(ans)
        contexts.append([c for c in ctxs])  # ragas expects List[List[str]]

    ds = Dataset.from_dict({
        "question": questions,
        "contexts": contexts,
        "answer": answers,
        "ground_truth": golds,
    })
    scores = evaluate(ds, metrics=[faithfulness, answer_relevance, context_recall])
    try:
        # Per-question rows when the result object supports to_pandas().
        recs = scores.to_pandas().to_dict(orient="records")
    except Exception:
        # Fallback: a single aggregate record (older ragas versions return a dict).
        recs = [scores] if isinstance(scores, dict) else [{"scores_raw": str(scores)}]

    # Attach latency per record; guarded index because the fallback path may
    # yield fewer records than questions.
    for i, r in enumerate(recs):
        r["latency_s"] = latencies[i] if i < len(latencies) else None

    return questions, answers, golds, recs
|
50 |
+
|
51 |
+
def main():
    """Run eval_file over every PDF in DATA_DIR that has a matching
    <name>.questions.json, then write one aggregate CSV of all score rows."""
    rows = []
    for pdf_path in glob.glob(os.path.join(DATA_DIR, "*.pdf")):
        base = os.path.splitext(os.path.basename(pdf_path))[0]
        # Question sets are paired with PDFs by filename convention.
        q_path = os.path.join(DATA_DIR, f"{base}.questions.json")
        if not os.path.exists(q_path):
            print(f"[WARN] Missing questions for {pdf_path} (expected {q_path}) — skipping.")
            continue
        print(f"[INFO] Evaluating {base} ...")
        qs, ans, gold, recs = eval_file(pdf_path, q_path)
        # Attach identifying columns so rows from different files can be told apart.
        for i, r in enumerate(recs):
            r.update({
                "file": base,
                "question": qs[i],
                "answer": ans[i],
                "ground_truth": gold[i],
            })
            rows.append(r)

    if not rows:
        print("[INFO] No evaluations produced.")
        return

    df = pd.DataFrame.from_records(rows)
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")  # timestamped output name
    out_csv = os.path.join(OUTPUT, f"aggregate_{stamp}.csv")
    df.to_csv(out_csv, index=False)
    print(f"[OK] Saved aggregate CSV → {out_csv}")

if __name__ == "__main__":
    main()
|
doc_loader.py
ADDED
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# doc_loader.py
|
2 |
+
import os
|
3 |
+
import io
|
4 |
+
from typing import Dict, Tuple
|
5 |
+
|
6 |
+
import fitz # PyMuPDF
|
7 |
+
import pdfplumber
|
8 |
+
import camelot
|
9 |
+
import pytesseract
|
10 |
+
from PIL import Image
|
11 |
+
|
12 |
+
|
13 |
+
def _extract_text_digital_pdf(path: str) -> str:
    """Extract embedded (digital) text from a PDF via PyMuPDF.

    Returns page texts joined by blank lines, or "" on any failure so the
    caller can fall back to OCR.
    """
    parts = []
    try:
        with fitz.open(path) as pdf:
            for page in pdf:
                txt = page.get_text("text") or ""
                if txt.strip():
                    parts.append(txt.strip())
    except Exception:
        # Corrupt/unreadable PDF → signal "no digital text" rather than crash.
        return ""
    return "\n\n".join(parts).strip()
|
24 |
+
|
25 |
+
|
26 |
+
def _extract_text_scanned_pdf(path: str, zoom: float = 2.0, lang: str = "eng") -> str:
    """Rasterize each page and OCR it with Tesseract (fallback for scanned PDFs).

    `zoom` upscales the render before OCR (higher zoom ≈ higher effective DPI,
    better recognition at the cost of speed). Returns "" on any failure.
    """
    out = []
    try:
        with fitz.open(path) as pdf:
            mat = fitz.Matrix(zoom, zoom)  # scale matrix applied to every page render
            for page in pdf:
                pix = page.get_pixmap(matrix=mat)
                img = Image.open(io.BytesIO(pix.tobytes("png")))
                ocr_text = pytesseract.image_to_string(img, lang=lang) or ""
                if ocr_text.strip():
                    out.append(ocr_text.strip())
    except Exception:
        return ""
    return "\n\n".join(out).strip()
|
40 |
+
|
41 |
+
|
42 |
+
def extract_text_from_pdf(path: str, lang: str = "eng") -> str:
    """Prefer the fast digital-text path; fall back to OCR for scanned PDFs.

    A digital extraction shorter than 200 characters is treated as a sign
    the PDF is a scan, triggering the OCR path.
    """
    min_digital_chars = 200
    text = _extract_text_digital_pdf(path)
    if len(text) < min_digital_chars:
        text = _extract_text_scanned_pdf(path, zoom=2.0, lang=lang)
    return text
|
47 |
+
|
48 |
+
|
49 |
+
def extract_tables_from_pdf(path: str) -> str:
    """Extract tables as plain text: Camelot first (lattice, then stream
    flavor), falling back to pdfplumber. Returns "" when nothing is found."""
    # Camelot first (lattice → stream)
    try:
        blocks = []
        for flavor in ("lattice", "stream"):
            try:
                tbs = camelot.read_pdf(path, pages="all", flavor=flavor)
                if tbs and len(tbs) > 0:
                    for i, tb in enumerate(tbs):
                        rows = [" | ".join(map(str, row)) for row in tb.df.values.tolist()]
                        blocks.append(f"Table ({flavor}) {i+1}:\n" + "\n".join(rows))
                    break  # stop at the first flavor that finds tables
            except Exception:
                continue  # this flavor failed; try the next one
        if blocks:
            return "\n\n".join(blocks).strip()
    except Exception:
        pass

    # Fallback to pdfplumber
    try:
        blocks = []
        with pdfplumber.open(path) as pdf:
            for pageno, page in enumerate(pdf.pages, start=1):
                tables = page.extract_tables() or []
                for t_i, table in enumerate(tables, start=1):
                    # pdfplumber cells may be None; render them as empty strings.
                    rows = [" | ".join([c if c is not None else "" for c in row]) for row in table]
                    blocks.append(f"Table (plumber) p.{pageno} #{t_i}:\n" + "\n".join(rows))
        return "\n\n".join(blocks).strip()
    except Exception:
        return ""
|
80 |
+
|
81 |
+
|
82 |
+
def extract_layout_text_pdf(path: str) -> str:
    """Layout-aware page text via pdfplumber tolerances; "" on any failure."""
    try:
        with pdfplumber.open(path) as pdf:
            page_texts = [
                (pg.extract_text(x_tolerance=2, y_tolerance=2) or "").strip()
                for pg in pdf.pages
            ]
        return "\n\n".join(t for t in page_texts if t).strip()
    except Exception:
        return ""
|
93 |
+
|
94 |
+
|
95 |
+
def extract_figures_and_captions_pdf(path: str) -> str:
    """List bitmap figures per page with their bbox and a caption read from a
    text band just below each image. Returns a plain-text summary, "" on failure."""
    try:
        out = []
        with pdfplumber.open(path) as pdf:
            for pageno, page in enumerate(pdf.pages, start=1):
                images = page.images or []
                for idx, im in enumerate(images, start=1):
                    x0 = float(im.get("x0", 0))
                    y0 = float(im.get("top", 0))
                    x1 = float(im.get("x1", 0))
                    y1 = float(im.get("bottom", 0))
                    ph = float(page.height)
                    # Caption band: a ~55pt strip directly under the image,
                    # clamped so it never extends past the page bottom.
                    band_top = min(y1 + 5, ph)
                    band_bottom = min(y1 + 60, ph)
                    cap = ""
                    # FIX: guard against a zero-height band when the image touches
                    # the page bottom (both clamps collapse to ph). This mirrors
                    # the identical guard in figure_extractor.extract_figures.
                    if band_bottom > band_top:
                        try:
                            band = page.within_bbox((x0, band_top, x1, band_bottom))
                            cap = (band.extract_text() or "").strip()
                        except Exception:
                            pass
                    out.append(f"Figure p.{pageno} #{idx} bbox=({int(x0)},{int(y0)},{int(x1)},{int(y1)})\nCaption: {cap or 'N/A'}")
        return "\n\n".join(out).strip()
    except Exception:
        return ""
|
116 |
+
|
117 |
+
|
118 |
+
def extract_text_from_image(path: str, lang: str = "eng") -> str:
    """OCR a single image file (jpg/png) with Tesseract; "" on any failure.

    FIX: the image is opened in a context manager so the underlying file
    handle is always released (PIL keeps lazily-loaded files open otherwise).
    """
    try:
        with Image.open(path) as img:
            return (pytesseract.image_to_string(img, lang=lang) or "").strip()
    except Exception:
        return ""
|
124 |
+
|
125 |
+
|
126 |
+
def load_document(path: str, return_sections: bool = False, lang: str = "eng"):
    """Load a PDF or image into plain text suitable for indexing.

    For PDFs, builds named sections ("Text", "Tables", "Layout", "Figures");
    for images, a single "OCR" section. Empty sections are dropped from the
    merged blob.

    Returns the merged text, or (merged, sections) when return_sections=True.
    Raises ValueError for unsupported extensions.
    """
    ext = os.path.splitext(path)[-1].lower()
    sections: Dict[str, str] = {}

    if ext == ".pdf":
        sections["Text"] = extract_text_from_pdf(path, lang=lang)
        tbl = extract_tables_from_pdf(path)
        if tbl:
            sections["Tables"] = tbl
        layout = extract_layout_text_pdf(path)
        if layout:
            sections["Layout"] = layout
        figs = extract_figures_and_captions_pdf(path)
        if figs:
            sections["Figures"] = figs

    elif ext in [".jpg", ".jpeg", ".png"]:
        sections["OCR"] = extract_text_from_image(path, lang=lang)

    else:
        raise ValueError(f"Unsupported file type: {ext}")

    # Merge non-empty sections into one blob, double-newline separated.
    merged = "\n\n".join([v for v in sections.values() if v and v.strip()]).strip()
    if return_sections:
        return merged, sections
    return merged
|
figure_extractor.py
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# figure_extractor.py
|
2 |
+
# figure_extractor.py
|
3 |
+
import os
|
4 |
+
import io
|
5 |
+
from typing import List, Dict, Any
|
6 |
+
|
7 |
+
import fitz # PyMuPDF
|
8 |
+
import pdfplumber
|
9 |
+
import pytesseract
|
10 |
+
from PIL import Image
|
11 |
+
|
12 |
+
|
13 |
+
def extract_figures(pdf_path: str, out_dir: str = "figures", lang: str = "eng") -> List[Dict[str, Any]]:
    """
    Detect bitmap figures in a PDF, crop them, OCR content, and read a caption band.
    Returns a list with metadata used for indexing + UI previews.

    Each record: {"page", "bbox", "path", "caption", "ocr_text", "tags"}.
    """
    os.makedirs(out_dir, exist_ok=True)
    results: List[Dict[str, Any]] = []

    with pdfplumber.open(pdf_path) as pl_doc:
        # FIX: close the PyMuPDF document deterministically; it was previously
        # never closed (resource leak on long-running processes).
        fz_doc = fitz.open(pdf_path)
        try:
            for pageno in range(len(fz_doc)):
                pl_page = pl_doc.pages[pageno]
                fz_page = fz_doc[pageno]

                images = pl_page.images or []
                if not images:
                    continue

                page_h = float(pl_page.height)
                page_w = float(pl_page.width)

                for idx, im in enumerate(images, start=1):
                    x0 = float(im.get("x0", 0))
                    y0 = float(im.get("top", 0))
                    x1 = float(im.get("x1", 0))
                    y1 = float(im.get("bottom", 0))

                    w = max(1.0, x1 - x0)
                    h = max(1.0, y1 - y0)
                    area = w * h
                    page_area = page_w * page_h
                    if area < 10000 or area < 0.01 * page_area:
                        continue  # skip tiny artifacts/icons

                    # Crop the figure region and save it as a PNG preview.
                    try:
                        rect = fitz.Rect(x0, y0, x1, y1)
                        pix = fz_page.get_pixmap(clip=rect, alpha=False)
                        img_path = os.path.join(out_dir, f"page_{pageno+1}_fig_{idx}.png")
                        pix.save(img_path)
                    except Exception:
                        continue

                    # OCR the cropped image. FIX: open the saved PNG directly in a
                    # context manager instead of the redundant open()+BytesIO
                    # round-trip, which also leaked the PIL file handle.
                    ocr_text = ""
                    try:
                        with Image.open(img_path) as img:
                            ocr_text = (pytesseract.image_to_string(img, lang=lang) or "").strip()
                    except Exception:
                        pass

                    # Caption: text in a ~55pt band just below the image, clamped to the page.
                    caption = ""
                    try:
                        band_top = min(y1 + 5, page_h)
                        band_bottom = min(y1 + 60, page_h)
                        if band_bottom > band_top:
                            band = pl_page.within_bbox((x0, band_top, x1, band_bottom))
                            caption = (band.extract_text() or "").strip()
                    except Exception:
                        pass

                    # Cheap keyword tagging used downstream for filtering.
                    tags = ["figure"]
                    low = (ocr_text + " " + caption).lower()
                    if any(k in low for k in ["chart", "graph", "trend", "bar", "line", "pie"]):
                        tags.append("chart")
                    if any(k in low for k in [
                        "revenue", "profit", "income", "eps", "cash flow",
                        "operating", "balance", "assets", "liabilities", "equity",
                        "ratio", "margin", "ebit", "ebitda"
                    ]):
                        tags.append("finance")

                    results.append({
                        "page": pageno + 1,
                        "bbox": (int(x0), int(y0), int(x1), int(y1)),
                        "path": img_path,
                        "caption": caption,
                        "ocr_text": ocr_text,
                        "tags": tags,
                    })
        finally:
            fz_doc.close()

    return results
|
ragas_eval.py
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# ragas_eval.py
# Convenience alias to run a single sample quickly.
import os, json, time
from datetime import datetime
import pandas as pd
from datasets import Dataset
from ragas import evaluate
# FIX: ragas exports the metric as `answer_relevancy`; importing
# `answer_relevance` raises ImportError. Alias it so the call in run()
# keeps working unchanged.
from ragas.metrics import faithfulness, context_recall
from ragas.metrics import answer_relevancy as answer_relevance

from doc_loader import load_document
from rag_pipeline import build_rag_pipeline, query_rag_full

DOC_PATH = "samples/finance_report.pdf"  # single document used for the smoke test
OUTPUT_DIR = "eval_runs"                 # destination for timestamped JSON results
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
16 |
+
|
17 |
+
def run():
    """One-off RAGAS smoke test against DOC_PATH with three canned questions;
    saves answers, latencies, and scores to a timestamped JSON in OUTPUT_DIR."""
    docs, sections = load_document(DOC_PATH, return_sections=True)
    db = build_rag_pipeline(docs)

    questions = [
        "What was the company’s net profit in 2022?",
        "What is the EPS reported for Q3 2023?",
        "Summarize the auditor’s opinion in one sentence.",
    ]
    gold = ["", "", ""]  # fill if known

    answers, contexts, latencies = [], [], []
    for q in questions:
        t0 = time.time()
        ans, ctxs, _ = query_rag_full(db, q, domain="Finance")
        lat = round(time.time() - t0, 3)
        answers.append(ans)
        contexts.append(ctxs)
        latencies.append(lat)

    ds = Dataset.from_dict({
        "question": questions,
        "contexts": [list(c) for c in contexts],  # ragas expects List[List[str]]
        "answer": answers,
        "ground_truth": gold,
    })
    scores = evaluate(ds, metrics=[faithfulness, answer_relevance, context_recall])

    # persist
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    out_json = os.path.join(OUTPUT_DIR, f"ragas_{stamp}.json")
    with open(out_json, "w") as f:
        json.dump({
            "doc_path": DOC_PATH,
            "questions": questions,
            "answers": answers,
            "latencies": latencies,
            # to_dict() when the result object supports it; str() fallback otherwise
            "scores": getattr(scores, "to_dict", lambda: str(scores))(),
        }, f, indent=2)

    print(f"[OK] Saved → {out_json}")

if __name__ == "__main__":
    run()
|
utils.py
ADDED
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# utils.py
|
2 |
+
from __future__ import annotations
|
3 |
+
|
4 |
+
import io
|
5 |
+
import os
|
6 |
+
import tempfile
|
7 |
+
from typing import Tuple, Optional, Union
|
8 |
+
|
9 |
+
import cv2
|
10 |
+
import numpy as np
|
11 |
+
import pytesseract
|
12 |
+
from PIL import Image
|
13 |
+
from PyPDF2 import PdfReader
|
14 |
+
|
15 |
+
# -------------------------------
|
16 |
+
# Small helpers
|
17 |
+
# -------------------------------
|
18 |
+
|
19 |
+
def _to_path(file_or_path: Union[str, bytes, os.PathLike, io.BufferedIOBase]) -> Tuple[str, Optional[str]]:
|
20 |
+
"""
|
21 |
+
Ensure we have a filesystem path. If a file-like is provided, write it to a temp file.
|
22 |
+
Returns (path, tmp_path); tmp_path is None if no temp file was created.
|
23 |
+
"""
|
24 |
+
if isinstance(file_or_path, (str, bytes, os.PathLike)):
|
25 |
+
return str(file_or_path), None
|
26 |
+
# file-like → persist to a temp file
|
27 |
+
suffix = ""
|
28 |
+
try:
|
29 |
+
name = getattr(file_or_path, "name", "")
|
30 |
+
if isinstance(name, str) and "." in name:
|
31 |
+
suffix = f".{name.rsplit('.', 1)[-1]}"
|
32 |
+
except Exception:
|
33 |
+
pass
|
34 |
+
tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
|
35 |
+
try:
|
36 |
+
# rewind if possible
|
37 |
+
if hasattr(file_or_path, "seek"):
|
38 |
+
try: file_or_path.seek(0)
|
39 |
+
except Exception: pass
|
40 |
+
tmp.write(file_or_path.read())
|
41 |
+
finally:
|
42 |
+
tmp.flush()
|
43 |
+
tmp.close()
|
44 |
+
return tmp.name, tmp.name
|
45 |
+
|
46 |
+
|
47 |
+
def _cleanup_tmp(tmp_path: Optional[str]) -> None:
|
48 |
+
if tmp_path and os.path.exists(tmp_path):
|
49 |
+
try:
|
50 |
+
os.remove(tmp_path)
|
51 |
+
except Exception:
|
52 |
+
pass
|
53 |
+
|
54 |
+
|
55 |
+
# -------------------------------
|
56 |
+
# PDF Text Extraction (PyPDF2 fast path)
|
57 |
+
# -------------------------------
|
58 |
+
|
59 |
+
def extract_text_from_pdf(file_or_path) -> str:
    """
    Extract plain text from a (digital) PDF using PyPDF2.
    Silent fallback (returns "") on failure to avoid polluting embeddings.
    Accepts a filesystem path or a file-like object (see _to_path).
    """
    path, tmp = _to_path(file_or_path)
    try:
        reader = PdfReader(path)
        pages_text = []
        for page in reader.pages:
            try:
                t = page.extract_text()
                if t:
                    pages_text.append(t)
            except Exception:
                # A single unreadable page should not sink the whole document.
                continue
        return "\n".join(pages_text).strip()
    except Exception:
        return ""
    finally:
        # Remove the temp copy _to_path may have created for file-like inputs.
        _cleanup_tmp(tmp)
|
80 |
+
|
81 |
+
|
82 |
+
# -------------------------------
|
83 |
+
# Image OCR
|
84 |
+
# -------------------------------
|
85 |
+
|
86 |
+
def extract_text_from_image(file_or_path, lang: str = "eng") -> str:
    """
    Basic OCR on an image (jpg/png). Handles file path or file-like.
    Returns "" on any failure; the temp file _to_path may create is always removed.

    FIX: the image is opened in a context manager so PIL's lazily-held file
    handle is released before the temp file is deleted (required on Windows).
    """
    path, tmp = _to_path(file_or_path)
    try:
        with Image.open(path) as img:
            return (pytesseract.image_to_string(img, lang=lang) or "").strip()
    except Exception:
        return ""
    finally:
        _cleanup_tmp(tmp)
|
98 |
+
|
99 |
+
|
100 |
+
# -------------------------------
|
101 |
+
# Tables from PDF (Camelot stream)
|
102 |
+
# -------------------------------
|
103 |
+
|
104 |
+
def extract_tables_from_pdf(file_path: str) -> str:
    """
    Extract tables using Camelot (stream flavor).
    Returns a single plain-text block, or "" when nothing could be extracted.
    """
    try:
        import camelot
        found = camelot.read_pdf(file_path, pages="all", flavor="stream")
        chunks = []
        for n, table in enumerate(found, start=1):
            body = "\n".join(
                " | ".join(str(cell) for cell in row)
                for row in table.df.values.tolist()
            )
            chunks.append(f"Table {n}:\n{body}")
        if not chunks:
            return ""
        return "\n\n".join(chunks).strip()
    except Exception:
        return ""
|
119 |
+
|
120 |
+
|
121 |
+
# -------------------------------
|
122 |
+
# Layout-aware text (pdfplumber)
|
123 |
+
# -------------------------------
|
124 |
+
|
125 |
+
def extract_layout_text(file_path: str) -> str:
    """
    Preserve headings/paragraphs via pdfplumber tolerances.
    Returns "" on any failure.
    """
    try:
        import pdfplumber
        collected = []
        with pdfplumber.open(file_path) as pdf:
            for pg in pdf.pages:
                chunk = (pg.extract_text(x_tolerance=2, y_tolerance=2) or "").strip()
                if chunk:
                    collected.append(chunk)
        return "\n\n".join(collected).strip()
    except Exception:
        return ""
|
140 |
+
|
141 |
+
|
142 |
+
# -------------------------------
|
143 |
+
# Chart / Graph OCR (OpenCV + Tesseract)
|
144 |
+
# -------------------------------
|
145 |
+
|
146 |
+
def extract_chart_text(image_input: Union[str, np.ndarray], lang: str = "eng") -> str:
    """
    Extract textual info from charts/graphs using a robust preprocessing pipeline:
    - grayscale
    - morphological tophat (remove background)
    - adaptive threshold (handles light/dark themes)
    - median denoise
    - OCR with conservative psm
    Accepts a file path or a BGR numpy array (OpenCV).
    Returns "" on any failure.
    """
    try:
        if isinstance(image_input, str):
            img = cv2.imread(image_input)  # None when the path is unreadable
        else:
            img = image_input
        if img is None:
            return ""

        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # background suppression (tophat)
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (9, 9))
        tophat = cv2.morphologyEx(gray, cv2.MORPH_TOPHAT, kernel)

        # adaptive threshold (robust to varying backgrounds)
        thr = cv2.adaptiveThreshold(
            tophat, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY, 31, 15
        )

        # slight opening to remove specks, then median blur
        opened = cv2.morphologyEx(thr, cv2.MORPH_OPEN, np.ones((2, 2), np.uint8))
        denoised = cv2.medianBlur(opened, 3)

        # Some charts have light text on dark bg → try inverted too and pick longer text
        inverted = cv2.bitwise_not(denoised)

        cfg = "--oem 3 --psm 6"  # assume a block of text
        txt1 = pytesseract.image_to_string(denoised, lang=lang, config=config_str(cfg))
        txt2 = pytesseract.image_to_string(inverted, lang=lang, config=config_str(cfg))

        # Keep whichever polarity yielded more recognized text.
        text = (txt1 or "")
        if len((txt2 or "").strip()) > len(text.strip()):
            text = txt2

        return text.strip()
    except Exception:
        return ""
|
194 |
+
|
195 |
+
|
196 |
+
def config_str(base: str) -> str:
    """Hook point for appending extra Tesseract config flags later.

    Currently an identity function: the config passes through unchanged.
    """
    return base
|
201 |
+
|
202 |
+
|
203 |
+
# -------------------------------
|
204 |
+
# Image-Text correlation helper
|
205 |
+
# -------------------------------
|
206 |
+
|
207 |
+
def merge_image_with_caption(image_text: str, caption: str) -> str:
    """
    Combine OCR text + caption into a single blob for embedding.
    Empty/None inputs are replaced with explicit placeholders so the
    embedding never sees a blank field.
    """
    ocr_part = (image_text or "").strip()
    cap_part = (caption or "").strip()
    if not ocr_part:
        ocr_part = "No visible text"
    if not cap_part:
        cap_part = "No caption"
    return f"Image Content: {ocr_part}\nCaption: {cap_part}"
|