Praneetha N committed on
Commit
1e5cf81
·
1 Parent(s): e05fa86

Add supporting scripts for RAG pipeline

Browse files
Files changed (5) hide show
  1. dataset_eval.py +81 -0
  2. doc_loader.py +151 -0
  3. figure_extractor.py +94 -0
  4. ragas_eval.py +60 -0
  5. utils.py +213 -0
dataset_eval.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # dataset_eval.py
2
+ import os, json, glob, time
3
+ from datetime import datetime
4
+ import pandas as pd
5
+ from datasets import Dataset
6
+ from ragas import evaluate
7
+ from ragas.metrics import faithfulness, answer_relevance, context_recall
8
+
9
+ from doc_loader import load_document
10
+ from rag_pipeline import build_rag_pipeline, query_rag_full
11
+
12
+ DATA_DIR = "datasets/finance"
13
+ OUTPUT = "eval_runs"
14
+ os.makedirs(OUTPUT, exist_ok=True)
15
+
16
def load_questions(q_path):
    """Read an eval question file and return (questions, ground_truths).

    The JSON file must contain a "questions" list; "ground_truth" is optional
    and defaults to one empty string per question so downstream code always
    gets parallel lists.
    """
    with open(q_path, "r") as fh:
        payload = json.load(fh)
    questions = payload["questions"]
    golds = payload.get("ground_truth", [""] * len(questions))
    return questions, golds
20
+
21
def eval_file(pdf_path, q_path, domain="Finance"):
    """Evaluate one PDF against its question file.

    Builds a RAG index from the document, answers every question, then scores
    the (question, contexts, answer, ground_truth) rows with ragas.

    Returns (questions, answers, golds, recs) where `recs` is a list of
    per-question score dicts, each annotated with its query latency in seconds.
    """
    # NOTE(review): current ragas releases export `answer_relevancy`, not
    # `answer_relevance` — confirm the metric import at the top of this file.
    docs, _ = load_document(pdf_path, return_sections=True)
    db = build_rag_pipeline(docs)
    questions, golds = load_questions(q_path)

    answers, contexts, latencies = [], [], []
    for q in questions:
        t0 = time.time()
        ans, ctxs, _docs = query_rag_full(db, q, domain=domain)
        latencies.append(round(time.time() - t0, 3))  # per-question wall time, ms precision
        answers.append(ans)
        contexts.append([c for c in ctxs])  # ragas expects List[List[str]]

    ds = Dataset.from_dict({
        "question": questions,
        "contexts": contexts,
        "answer": answers,
        "ground_truth": golds,
    })
    scores = evaluate(ds, metrics=[faithfulness, answer_relevance, context_recall])
    try:
        # Result objects from newer ragas versions expose to_pandas();
        # one record per evaluated question.
        recs = scores.to_pandas().to_dict(orient="records")
    except Exception:
        # Fall back to whatever shape the installed ragas version returned.
        recs = [scores] if isinstance(scores, dict) else [{"scores_raw": str(scores)}]

    # Attach latency per question; guard against any length mismatch between
    # the fallback record list and the latency list.
    for i, r in enumerate(recs):
        r["latency_s"] = latencies[i] if i < len(latencies) else None

    return questions, answers, golds, recs
50
+
51
def main():
    """Evaluate every PDF in DATA_DIR that has a matching `<name>.questions.json`
    file, then write one aggregate CSV of per-question score rows."""
    all_records = []
    for pdf_path in glob.glob(os.path.join(DATA_DIR, "*.pdf")):
        base = os.path.splitext(os.path.basename(pdf_path))[0]
        q_path = os.path.join(DATA_DIR, f"{base}.questions.json")
        if not os.path.exists(q_path):
            print(f"[WARN] Missing questions for {pdf_path} (expected {q_path}) — skipping.")
            continue
        print(f"[INFO] Evaluating {base} ...")
        qs, ans, gold, recs = eval_file(pdf_path, q_path)
        # Annotate each score record with its source file and Q/A text.
        for i, rec in enumerate(recs):
            rec["file"] = base
            rec["question"] = qs[i]
            rec["answer"] = ans[i]
            rec["ground_truth"] = gold[i]
            all_records.append(rec)

    if not all_records:
        print("[INFO] No evaluations produced.")
        return

    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    out_csv = os.path.join(OUTPUT, f"aggregate_{stamp}.csv")
    pd.DataFrame.from_records(all_records).to_csv(out_csv, index=False)
    print(f"[OK] Saved aggregate CSV → {out_csv}")


if __name__ == "__main__":
    main()
doc_loader.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # doc_loader.py
2
+ import os
3
+ import io
4
+ from typing import Dict, Tuple
5
+
6
+ import fitz # PyMuPDF
7
+ import pdfplumber
8
+ import camelot
9
+ import pytesseract
10
+ from PIL import Image
11
+
12
+
13
def _extract_text_digital_pdf(path: str) -> str:
    """Pull the embedded text layer from a digital PDF via PyMuPDF.

    Returns "" on any failure so callers can fall back to OCR.
    """
    chunks = []
    try:
        with fitz.open(path) as pdf:
            for page in pdf:
                page_text = (page.get_text("text") or "").strip()
                if page_text:
                    chunks.append(page_text)
    except Exception:
        return ""
    return "\n\n".join(chunks).strip()
24
+
25
+
26
def _extract_text_scanned_pdf(path: str, zoom: float = 2.0, lang: str = "eng") -> str:
    """OCR every page of a (likely scanned) PDF.

    Pages are rasterized at `zoom`x resolution with PyMuPDF and fed to
    Tesseract. Returns "" on any failure.
    """
    page_texts = []
    try:
        with fitz.open(path) as pdf:
            matrix = fitz.Matrix(zoom, zoom)
            for page in pdf:
                pix = page.get_pixmap(matrix=matrix)
                image = Image.open(io.BytesIO(pix.tobytes("png")))
                text = (pytesseract.image_to_string(image, lang=lang) or "").strip()
                if text:
                    page_texts.append(text)
    except Exception:
        return ""
    return "\n\n".join(page_texts).strip()
40
+
41
+
42
def extract_text_from_pdf(path: str, lang: str = "eng") -> str:
    """Extract text from a PDF, preferring the digital text layer.

    Falls back to page-level OCR when the digital layer yields fewer than
    200 characters (heuristic for a scanned document).
    """
    digital_text = _extract_text_digital_pdf(path)
    if len(digital_text) < 200:
        return _extract_text_scanned_pdf(path, zoom=2.0, lang=lang)
    return digital_text
47
+
48
+
49
def extract_tables_from_pdf(path: str) -> str:
    """Extract tables as one plain-text block.

    Tries Camelot first (lattice flavor, then stream); the first flavor that
    yields tables wins. Falls back to pdfplumber when Camelot finds nothing.
    Returns "" when no tables can be extracted.
    """
    # Camelot first (lattice → stream)
    try:
        camelot_blocks = []
        for flavor in ("lattice", "stream"):
            try:
                tables = camelot.read_pdf(path, pages="all", flavor=flavor)
                if tables and len(tables) > 0:
                    for i, table in enumerate(tables):
                        lines = [" | ".join(map(str, row)) for row in table.df.values.tolist()]
                        camelot_blocks.append(f"Table ({flavor}) {i+1}:\n" + "\n".join(lines))
                    break  # first flavor that produced tables wins
            except Exception:
                continue
        if camelot_blocks:
            return "\n\n".join(camelot_blocks).strip()
    except Exception:
        pass

    # Fallback to pdfplumber
    try:
        plumber_blocks = []
        with pdfplumber.open(path) as pdf:
            for pageno, page in enumerate(pdf.pages, start=1):
                for t_i, table in enumerate(page.extract_tables() or [], start=1):
                    lines = [" | ".join([c if c is not None else "" for c in row]) for row in table]
                    plumber_blocks.append(f"Table (plumber) p.{pageno} #{t_i}:\n" + "\n".join(lines))
        return "\n\n".join(plumber_blocks).strip()
    except Exception:
        return ""
80
+
81
+
82
def extract_layout_text_pdf(path: str) -> str:
    """Layout-aware text extraction via pdfplumber tolerances; "" on failure."""
    try:
        with pdfplumber.open(path) as pdf:
            page_texts = [
                (page.extract_text(x_tolerance=2, y_tolerance=2) or "").strip()
                for page in pdf.pages
            ]
        return "\n\n".join(t for t in page_texts if t).strip()
    except Exception:
        return ""
93
+
94
+
95
def extract_figures_and_captions_pdf(path: str) -> str:
    """Describe each embedded bitmap figure in the PDF as a text block.

    For every image on every page, records page number, figure index, and
    bounding box, and tries to read a caption from a thin band directly below
    the image. Returns "" on any failure.
    """
    try:
        out = []
        with pdfplumber.open(path) as pdf:
            for pageno, page in enumerate(pdf.pages, start=1):
                images = page.images or []
                for idx, im in enumerate(images, start=1):
                    # pdfplumber image coords: x0/x1 horizontal, top/bottom vertical
                    # (top-left origin).
                    x0, y0, x1, y1 = float(im.get("x0", 0)), float(im.get("top", 0)), float(im.get("x1", 0)), float(im.get("bottom", 0))
                    ph = float(page.height)
                    # Caption band: 5–60 pts below the image bottom, clamped to the page.
                    band_top = min(y1 + 5, ph)
                    band_bottom = min(y1 + 60, ph)
                    cap = ""
                    try:
                        band = page.within_bbox((x0, band_top, x1, band_bottom))
                        cap = (band.extract_text() or "").strip()
                    except Exception:
                        # Degenerate band (e.g. image touching the page bottom) — no caption.
                        pass
                    out.append(f"Figure p.{pageno} #{idx} bbox=({int(x0)},{int(y0)},{int(x1)},{int(y1)})\nCaption: {cap or 'N/A'}")
        return "\n\n".join(out).strip()
    except Exception:
        return ""
116
+
117
+
118
def extract_text_from_image(path: str, lang: str = "eng") -> str:
    """OCR a standalone image file; returns "" if unreadable."""
    try:
        ocr_text = pytesseract.image_to_string(Image.open(path), lang=lang) or ""
        return ocr_text.strip()
    except Exception:
        return ""
124
+
125
+
126
def load_document(path: str, return_sections: bool = False, lang: str = "eng"):
    """Load a PDF or image into plain text for indexing.

    PDFs contribute text, tables, layout text, and figure descriptions;
    images contribute OCR text. Raises ValueError for any other extension.
    Returns the merged text, or (merged_text, sections_dict) when
    `return_sections` is True.
    """
    ext = os.path.splitext(path)[-1].lower()
    sections: Dict[str, str] = {}

    if ext == ".pdf":
        sections["Text"] = extract_text_from_pdf(path, lang=lang)
        # Optional sections only appear when their extractor found something.
        for label, extractor in (
            ("Tables", extract_tables_from_pdf),
            ("Layout", extract_layout_text_pdf),
            ("Figures", extract_figures_and_captions_pdf),
        ):
            chunk = extractor(path)
            if chunk:
                sections[label] = chunk
    elif ext in [".jpg", ".jpeg", ".png"]:
        sections["OCR"] = extract_text_from_image(path, lang=lang)
    else:
        raise ValueError(f"Unsupported file type: {ext}")

    merged = "\n\n".join([v for v in sections.values() if v and v.strip()]).strip()
    return (merged, sections) if return_sections else merged
figure_extractor.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # figure_extractor.py
2
+ # Figure extraction: crop bitmap figures from PDFs, OCR them, read captions.
3
+ import os
4
+ import io
5
+ from typing import List, Dict, Any
6
+
7
+ import fitz # PyMuPDF
8
+ import pdfplumber
9
+ import pytesseract
10
+ from PIL import Image
11
+
12
+
13
def extract_figures(pdf_path: str, out_dir: str = "figures", lang: str = "eng") -> List[Dict[str, Any]]:
    """
    Detect bitmap figures in a PDF, crop them, OCR content, and read a caption band.
    Returns a list with metadata used for indexing + UI previews.

    Each result dict holds: "page" (1-based), "bbox" (int coords), "path"
    (cropped PNG on disk), "caption" (text found just below the image),
    "ocr_text" (OCR of the crop), and "tags" (heuristic labels: always
    "figure", plus "chart" and/or "finance" when keywords match).
    """
    os.makedirs(out_dir, exist_ok=True)
    results: List[Dict[str, Any]] = []

    with pdfplumber.open(pdf_path) as pl_doc:
        fz_doc = fitz.open(pdf_path)
        try:  # bug fix: the PyMuPDF document was previously never closed (handle leak)
            for pageno in range(len(fz_doc)):
                pl_page = pl_doc.pages[pageno]
                fz_page = fz_doc[pageno]

                images = pl_page.images or []
                if not images:
                    continue

                page_h = float(pl_page.height)
                page_w = float(pl_page.width)

                for idx, im in enumerate(images, start=1):
                    x0 = float(im.get("x0", 0))
                    y0 = float(im.get("top", 0))
                    x1 = float(im.get("x1", 0))
                    y1 = float(im.get("bottom", 0))

                    # Skip tiny artifacts/icons: require both an absolute
                    # minimum area and >= 1% of the page area.
                    w = max(1.0, x1 - x0)
                    h = max(1.0, y1 - y0)
                    area = w * h
                    page_area = page_w * page_h
                    if area < 10000 or area < 0.01 * page_area:
                        continue

                    # Crop the figure region to a PNG on disk (best-effort).
                    try:
                        rect = fitz.Rect(x0, y0, x1, y1)
                        pix = fz_page.get_pixmap(clip=rect, alpha=False)
                        img_path = os.path.join(out_dir, f"page_{pageno+1}_fig_{idx}.png")
                        pix.save(img_path)
                    except Exception:
                        continue

                    # OCR the cropped image (best-effort; "" on failure).
                    ocr_text = ""
                    try:
                        img = Image.open(img_path)
                        ocr_text = (pytesseract.image_to_string(img, lang=lang) or "").strip()
                    except Exception:
                        pass

                    # Caption: text in a 5–60 pt band just below the image,
                    # clamped to the page bottom.
                    caption = ""
                    try:
                        band_top = min(y1 + 5, page_h)
                        band_bottom = min(y1 + 60, page_h)
                        if band_bottom > band_top:
                            band = pl_page.within_bbox((x0, band_top, x1, band_bottom))
                            caption = (band.extract_text() or "").strip()
                    except Exception:
                        pass

                    # Heuristic tags from OCR + caption keywords.
                    tags = ["figure"]
                    low = (ocr_text + " " + caption).lower()
                    if any(k in low for k in ["chart", "graph", "trend", "bar", "line", "pie"]):
                        tags.append("chart")
                    if any(k in low for k in [
                        "revenue", "profit", "income", "eps", "cash flow",
                        "operating", "balance", "assets", "liabilities", "equity",
                        "ratio", "margin", "ebit", "ebitda"
                    ]):
                        tags.append("finance")

                    results.append({
                        "page": pageno + 1,
                        "bbox": (int(x0), int(y0), int(x1), int(y1)),
                        "path": img_path,
                        "caption": caption,
                        "ocr_text": ocr_text,
                        "tags": tags,
                    })
        finally:
            fz_doc.close()

    return results
ragas_eval.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ragas_eval.py
2
+ # Convenience alias to run a single sample quickly.
3
+ import os, json, time
4
+ from datetime import datetime
5
+ import pandas as pd
6
+ from datasets import Dataset
7
+ from ragas import evaluate
8
+ from ragas.metrics import faithfulness, answer_relevance, context_recall
9
+
10
+ from doc_loader import load_document
11
+ from rag_pipeline import build_rag_pipeline, query_rag_full
12
+
13
+ DOC_PATH = "samples/finance_report.pdf"
14
+ OUTPUT_DIR = "eval_runs"
15
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
16
+
17
def run():
    """Quick single-document ragas smoke test.

    Loads DOC_PATH, builds the RAG index, answers three hard-coded finance
    questions, scores them with ragas, and writes a timestamped JSON report
    into OUTPUT_DIR.
    """
    # NOTE(review): current ragas releases export `answer_relevancy`, not
    # `answer_relevance` — confirm the metric import at the top of this file.
    docs, sections = load_document(DOC_PATH, return_sections=True)
    db = build_rag_pipeline(docs)

    questions = [
        "What was the company’s net profit in 2022?",
        "What is the EPS reported for Q3 2023?",
        "Summarize the auditor’s opinion in one sentence.",
    ]
    gold = ["", "", ""]  # fill if known

    answers, contexts, latencies = [], [], []
    for q in questions:
        t0 = time.time()
        ans, ctxs, _ = query_rag_full(db, q, domain="Finance")
        lat = round(time.time() - t0, 3)  # per-question wall time, ms precision
        answers.append(ans)
        contexts.append(ctxs)
        latencies.append(lat)

    ds = Dataset.from_dict({
        "question": questions,
        "contexts": [list(c) for c in contexts],  # ragas expects List[List[str]]
        "answer": answers,
        "ground_truth": gold,
    })
    scores = evaluate(ds, metrics=[faithfulness, answer_relevance, context_recall])

    # persist
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    out_json = os.path.join(OUTPUT_DIR, f"ragas_{stamp}.json")
    with open(out_json, "w") as f:
        json.dump({
            "doc_path": DOC_PATH,
            "questions": questions,
            "answers": answers,
            "latencies": latencies,
            # Use `to_dict()` when the result object supports it; otherwise
            # store its string representation.
            "scores": getattr(scores, "to_dict", lambda: str(scores))(),
        }, f, indent=2)

    print(f"[OK] Saved → {out_json}")


if __name__ == "__main__":
    run()
utils.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # utils.py
2
+ from __future__ import annotations
3
+
4
+ import io
5
+ import os
6
+ import tempfile
7
+ from typing import Tuple, Optional, Union
8
+
9
+ import cv2
10
+ import numpy as np
11
+ import pytesseract
12
+ from PIL import Image
13
+ from PyPDF2 import PdfReader
14
+
15
+ # -------------------------------
16
+ # Small helpers
17
+ # -------------------------------
18
+
19
+ def _to_path(file_or_path: Union[str, bytes, os.PathLike, io.BufferedIOBase]) -> Tuple[str, Optional[str]]:
20
+ """
21
+ Ensure we have a filesystem path. If a file-like is provided, write it to a temp file.
22
+ Returns (path, tmp_path); tmp_path is None if no temp file was created.
23
+ """
24
+ if isinstance(file_or_path, (str, bytes, os.PathLike)):
25
+ return str(file_or_path), None
26
+ # file-like → persist to a temp file
27
+ suffix = ""
28
+ try:
29
+ name = getattr(file_or_path, "name", "")
30
+ if isinstance(name, str) and "." in name:
31
+ suffix = f".{name.rsplit('.', 1)[-1]}"
32
+ except Exception:
33
+ pass
34
+ tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
35
+ try:
36
+ # rewind if possible
37
+ if hasattr(file_or_path, "seek"):
38
+ try: file_or_path.seek(0)
39
+ except Exception: pass
40
+ tmp.write(file_or_path.read())
41
+ finally:
42
+ tmp.flush()
43
+ tmp.close()
44
+ return tmp.name, tmp.name
45
+
46
+
47
+ def _cleanup_tmp(tmp_path: Optional[str]) -> None:
48
+ if tmp_path and os.path.exists(tmp_path):
49
+ try:
50
+ os.remove(tmp_path)
51
+ except Exception:
52
+ pass
53
+
54
+
55
+ # -------------------------------
56
+ # PDF Text Extraction (PyPDF2 fast path)
57
+ # -------------------------------
58
+
59
def extract_text_from_pdf(file_or_path) -> str:
    """
    Extract plain text from a (digital) PDF using PyPDF2.
    Silent fallback (returns "") on failure to avoid polluting embeddings.
    Accepts a path or a file-like object (persisted via `_to_path`).
    """
    path, tmp = _to_path(file_or_path)
    try:
        collected = []
        for page in PdfReader(path).pages:
            try:
                page_text = page.extract_text()
            except Exception:
                continue  # skip unparsable pages, keep the rest
            if page_text:
                collected.append(page_text)
        return "\n".join(collected).strip()
    except Exception:
        return ""
    finally:
        _cleanup_tmp(tmp)
80
+
81
+
82
+ # -------------------------------
83
+ # Image OCR
84
+ # -------------------------------
85
+
86
def extract_text_from_image(file_or_path, lang: str = "eng") -> str:
    """
    Basic OCR on an image (jpg/png). Handles file path or file-like.
    Returns "" on any failure; always cleans up any temp file it created.
    """
    path, tmp = _to_path(file_or_path)
    try:
        ocr = pytesseract.image_to_string(Image.open(path), lang=lang) or ""
        return ocr.strip()
    except Exception:
        return ""
    finally:
        _cleanup_tmp(tmp)
98
+
99
+
100
+ # -------------------------------
101
+ # Tables from PDF (Camelot stream)
102
+ # -------------------------------
103
+
104
def extract_tables_from_pdf(file_path: str) -> str:
    """
    Extract tables using Camelot (stream flavor).
    Returns a single plain text block, or "" on failure / no tables found.
    """
    try:
        import camelot
        blocks = []
        for i, table in enumerate(camelot.read_pdf(file_path, pages="all", flavor="stream")):
            cells = table.df.values.tolist()
            body = "\n".join(" | ".join(map(str, row)) for row in cells)
            blocks.append(f"Table {i+1}:\n" + body)
        return "\n\n".join(blocks).strip() if blocks else ""
    except Exception:
        return ""
119
+
120
+
121
+ # -------------------------------
122
+ # Layout-aware text (pdfplumber)
123
+ # -------------------------------
124
+
125
def extract_layout_text(file_path: str) -> str:
    """
    Preserve headings/paragraphs via pdfplumber tolerances.
    Returns "" on any failure.
    """
    try:
        import pdfplumber
        with pdfplumber.open(file_path) as pdf:
            page_texts = [
                (page.extract_text(x_tolerance=2, y_tolerance=2) or "").strip()
                for page in pdf.pages
            ]
        return "\n\n".join(t for t in page_texts if t).strip()
    except Exception:
        return ""
140
+
141
+
142
+ # -------------------------------
143
+ # Chart / Graph OCR (OpenCV + Tesseract)
144
+ # -------------------------------
145
+
146
def extract_chart_text(image_input: Union[str, np.ndarray], lang: str = "eng") -> str:
    """
    Extract textual info from charts/graphs using a robust preprocessing pipeline:
    - grayscale
    - morphological tophat (remove background)
    - adaptive threshold (handles light/dark themes)
    - median denoise
    - OCR with conservative psm
    Accepts a file path or a BGR numpy array (OpenCV).
    Returns "" on any failure.
    """
    try:
        if isinstance(image_input, str):
            img = cv2.imread(image_input)
        else:
            img = image_input
        if img is None:
            # cv2.imread returns None for unreadable/missing paths.
            return ""

        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # background suppression (tophat)
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (9, 9))
        tophat = cv2.morphologyEx(gray, cv2.MORPH_TOPHAT, kernel)

        # adaptive threshold (robust to varying backgrounds)
        thr = cv2.adaptiveThreshold(
            tophat, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY, 31, 15
        )

        # slight opening to remove specks, then median blur
        opened = cv2.morphologyEx(thr, cv2.MORPH_OPEN, np.ones((2, 2), np.uint8))
        denoised = cv2.medianBlur(opened, 3)

        # Some charts have light text on dark bg → try inverted too and pick longer text
        inverted = cv2.bitwise_not(denoised)

        cfg = "--oem 3 --psm 6"  # assume a block of text
        txt1 = pytesseract.image_to_string(denoised, lang=lang, config=config_str(cfg))
        txt2 = pytesseract.image_to_string(inverted, lang=lang, config=config_str(cfg))

        # Keep whichever polarity yielded the longer stripped result.
        text = (txt1 or "")
        if len((txt2 or "").strip()) > len(text.strip()):
            text = txt2

        return text.strip()
    except Exception:
        return ""
194
+
195
+
196
def config_str(base: str) -> str:
    """Pass-through hook: a single place to append extra Tesseract config
    flags later without touching every call site."""
    result = base
    return result
201
+
202
+
203
+ # -------------------------------
204
+ # Image-Text correlation helper
205
+ # -------------------------------
206
+
207
def merge_image_with_caption(image_text: str, caption: str) -> str:
    """
    Combine OCR text + caption into a single blob for embedding.
    Blank or None inputs become the placeholders "No visible text" /
    "No caption" so the embedded string always has both fields.
    """
    body = (image_text or "").strip()
    cap = (caption or "").strip()
    if not body:
        body = "No visible text"
    if not cap:
        cap = "No caption"
    return f"Image Content: {body}\nCaption: {cap}"