SkyNait committed on
Commit ee89119 · 1 Parent(s): aa23348

Topic extraction upgrades

__pycache__/inference_svm_model.cpython-310.pyc CHANGED
Binary files a/__pycache__/inference_svm_model.cpython-310.pyc and b/__pycache__/inference_svm_model.cpython-310.pyc differ
 
__pycache__/mineru_single.cpython-310.pyc CHANGED
Binary files a/__pycache__/mineru_single.cpython-310.pyc and b/__pycache__/mineru_single.cpython-310.pyc differ
 
__pycache__/topic_extraction_upgrade.cpython-310.pyc ADDED
Binary file (10.9 kB)
 
__pycache__/worker.cpython-310.pyc CHANGED
Binary files a/__pycache__/worker.cpython-310.pyc and b/__pycache__/worker.cpython-310.pyc differ
 
input_output/aqa-Mathematics-specification.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6d2a3998a4988ef6881262f22660a8e0719bb8d648a757db91041cfabbf40bb3
+ size 888895
mineru_test_local.py ADDED
@@ -0,0 +1,374 @@
+ #!/usr/bin/env python3
+ import os
+ import re
+ import gc
+ from magic_pdf.data.dataset import PymuDocDataset
+ from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+ import json
+ import base64
+ import logging
+ import concurrent.futures
+ from io import BytesIO
+ from google import genai
+ from google.genai import types
+ import torch
+ import cv2
+
+ from inference_svm_model import SVMModel
+ from topic_extraction_upgrade import TableExtractor
+
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
+     handlers=[
+         logging.StreamHandler(),
+         logging.FileHandler('mineru.log')
+     ]
+ )
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.INFO)
+
+ def call_gemini_for_table_classification(image_data: bytes) -> str:
+     """
+     Returns one of: "TWO_COLUMN", "THREE_COLUMN", or "NO_TABLE".
+     """
+     if genai is None or types is None:
+         logger.warning("Gemini libraries not available. Defaulting to NO_TABLE.")
+         return "NO_TABLE"
+
+     prompt = """You are given an image. Determine if it shows a relevant table that has exactly 2 or 3 columns.
+ The 'relevant' table examples are the first and second reference images. The third reference image is irrelevant.
+ If the image is a relevant table with 2 columns, respond with 'TWO_COLUMN'.
+ If the image is a relevant table with 3 columns, respond with 'THREE_COLUMN'.
+ If the image does not show a relevant table with 2 or 3 columns, respond with 'NO_TABLE'.
+ Return only one of these exact labels as your entire response:
+ TWO_COLUMN
+ THREE_COLUMN
+ NO_TABLE
+ """
+     try:
+         client = genai.Client(api_key="YOUR_GEMINI_API_KEY")  # Provide your real API key
+         response = client.models.generate_content(
+             model="gemini-2.0-flash",
+             config=types.GenerateContentConfig(temperature=0.),
+             contents=[
+                 {
+                     "parts": [
+                         {"text": prompt},
+                         {
+                             "inline_data": {
+                                 "mime_type": "image/jpeg",
+                                 "data": base64.b64encode(image_data).decode('utf-8')
+                             }
+                         }
+                     ]
+                 }
+             ]
+         )
+
+         classification = response.text.strip() if (response and response.text) else "NO_TABLE"
+         classification = classification.upper()
+         if "THREE" in classification:
+             return "THREE_COLUMN"
+         elif "TWO" in classification:
+             return "TWO_COLUMN"
+         else:
+             return "NO_TABLE"
+
+     except Exception as e:
+         logger.error(f"[Gemini Table Classification Error]: {str(e)}")
+         return "NO_TABLE"
+
+ def call_gemini_for_image_description(image_data: bytes) -> str:
+     if genai is None or types is None:
+         logger.warning("Gemini libraries not available. Returning fallback description.")
+         return "Image description unavailable"
+
+     try:
+         client = genai.Client(api_key="YOUR_GEMINI_API_KEY")  # Provide your real API key
+         response = client.models.generate_content(
+             model="gemini-2.0-flash",
+             config=types.GenerateContentConfig(temperature=0.),
+             contents=[
+                 {
+                     "parts": [
+                         {
+                             "text": """The provided image is a part of a question paper or mark scheme.
+ Extract all the necessary information from the image to be able to identify the question.
+ To identify the question, we only need the following: question number and question part.
+ Don't include redundant information.
+ For example, if the image contains text like: "Q1 Part A Answer: Life on earth was created by deity..."
+ you should return just "Q1 Part A Mark Scheme"
+ If there is no text on this image, return a description of the image. 20 words max.
+ If there is not enough data, consider information from the surrounding context.
+ Additionally, if the image contains a truncated part, you must describe it and mark it as a
+ part of another image that comes before or after the current image.
+ If the image is of a multiple-choice question's options, then modify your answer by appending
+ 'MCQ: A [option] B [option] C [option] D [option]' (replacing [option] with the actual options).
+ Otherwise, follow the above instructions strictly.
+ """},
+                         {
+                             "inline_data": {
+                                 "mime_type": "image/jpeg",
+                                 "data": base64.b64encode(image_data).decode('utf-8')
+                             }
+                         }
+                     ]
+                 }
+             ]
+         )
+
+         description = response.text.strip() if (response and response.text) else "Image description unavailable"
+         return description
+
+     except Exception as e:
+         logger.error(f"[Gemini Description Error]: {str(e)}")
+         return "Image description unavailable"
+
+ class DataWriter:
+     """
+     Base class for handling extracted images.
+     """
+     def write(self, path: str, data: bytes) -> None:
+         raise NotImplementedError
+
+     def post_process(self, key: str, md_content: str) -> str:
+         raise NotImplementedError
+
+ class LocalImageWriter(DataWriter):
+     """
+     Stores extracted images locally so they can be referenced in local Markdown previews.
+     SVM filters out blank images. Then we do Gemini classification for table detection or normal description.
+     Finally, we rewrite the Markdown to reference these local images.
+     """
+     def __init__(self, output_folder: str, svm_model: SVMModel):
+         """
+         :param output_folder: Base folder where images and the final MD will be saved.
+         :param svm_model: SVM model for blank-image detection.
+         """
+         self.output_folder = output_folder
+         self.svm_model = svm_model
+         self.descriptions = {}
+         """
+         self.descriptions structure:
+         {
+             "{local_id_or_path}": {
+                 "data": bytes,
+                 "relative_path": str,  # relative path to the saved image
+                 "description": "",  # Gemini description
+                 "table_classification": "TWO_COLUMN" / "THREE_COLUMN" / "NO_TABLE",
+                 "final_alt": ""  # final alt text for the MD
+             }
+         }
+         """
+         os.makedirs(self.output_folder, exist_ok=True)
+         self.images_dir = os.path.join(self.output_folder, "images")
+         os.makedirs(self.images_dir, exist_ok=True)
+
+         self._img_count = 0
+
+     def write(self, path: str, data: bytes) -> None:
+         """
+         1) Use the SVM to check if the image is blank/irrelevant.
+         2) If not blank, save the image locally (images/img_{count}.png).
+         3) Keep track in self.descriptions for post-process usage.
+         """
+         is_blank = self.svm_model.is_blank_image(data)
+         if is_blank:
+             logger.info(f"[SVM] Detected blank/irrelevant image: {path}. Skipping.")
+             return
+
+         self._img_count += 1
+         # Example local path
+         local_filename = f"img_{self._img_count}.png"
+         local_path = os.path.join(self.images_dir, local_filename)
+
+         with open(local_path, "wb") as f:
+             f.write(data)
+
+         rel_path_for_md = os.path.relpath(local_path, self.output_folder)
+
+         self.descriptions[path] = {
+             "data": data,
+             "relative_path": rel_path_for_md,  # e.g. "images/img_1.png"
+             "description": "",
+             "table_classification": "NO_TABLE",
+             "final_alt": ""
+         }
+
+     def post_process(self, key: str, md_content: str) -> str:
+         """
+         1) Gemini classification (table vs. no table).
+         2) If table => alt = "HAS TO BE PROCESSED - two/three column table".
+            Else => normal Gemini-based description.
+         3) Replace all ![]({key}{path}) with ![final_alt](relative_local_path).
+         4) For any "HAS TO BE PROCESSED" images, run TableExtractor.
+         """
+         # Step A: Table classification
+         logger.info("Starting Gemini table classification for each local image...")
+         if not self.descriptions:
+             return md_content
+
+         max_workers = len(self.descriptions)
+         with concurrent.futures.ThreadPoolExecutor(max_workers=max(max_workers, 1)) as executor:
+             future_to_path = {
+                 executor.submit(call_gemini_for_table_classification, info['data']): p
+                 for p, info in self.descriptions.items()
+             }
+             for future in concurrent.futures.as_completed(future_to_path):
+                 path = future_to_path[future]
+                 try:
+                     classification = future.result()
+                     self.descriptions[path]['table_classification'] = classification
+                 except Exception as e:
+                     logger.error(f"[Gemini Table Classification Error for {path}]: {str(e)}")
+                     self.descriptions[path]['table_classification'] = "NO_TABLE"
+
+         # Step B: For images classified "NO_TABLE", get a normal Gemini-based description
+         logger.info("Starting Gemini question-based description for non-table images...")
+         with concurrent.futures.ThreadPoolExecutor(max_workers=max(max_workers, 1)) as executor:
+             fut_map = {}
+             for path, info in self.descriptions.items():
+                 if info['table_classification'] == "NO_TABLE":
+                     fut = executor.submit(call_gemini_for_image_description, info['data'])
+                     fut_map[fut] = path
+
+             for fut in concurrent.futures.as_completed(fut_map):
+                 path = fut_map[fut]
+                 try:
+                     desc = fut.result()
+                     self.descriptions[path]['description'] = desc
+                 except Exception as e:
+                     logger.error(f"[Gemini Description Error for {path}]: {str(e)}")
+                     self.descriptions[path]['description'] = "Image description unavailable"
+
+         # Step C: Construct the final alt text
+         for path, info in self.descriptions.items():
+             classification = info['table_classification']
+             if classification == "TWO_COLUMN":
+                 final_alt = "HAS TO BE PROCESSED - two column table"
+             elif classification == "THREE_COLUMN":
+                 final_alt = "HAS TO BE PROCESSED - three column table"
+             else:
+                 # normal Gemini-based description
+                 final_alt = info['description'] or "Image description unavailable"
+             info['final_alt'] = final_alt
+
+         # Step D: Rewrite the Markdown image tags, then process any table images
+         for path, info in self.descriptions.items():
+             old_md_tag = f"![]({key}{path})"
+             new_md_tag = f"![{info['final_alt']}]({info['relative_path']})"
+             md_content = md_content.replace(old_md_tag, new_md_tag)
+
+         md_content = self._process_table_images_in_markdown(md_content)
+
+         return md_content
+
+     def _process_table_images_in_markdown(self, md_content: str) -> str:
+         """
+         Finds images with alt text like:
+           ![HAS TO BE PROCESSED - (two|three) column table](images/img_1.png)
+         then runs TableExtractor with specific parameters for two/three columns.
+         Saves each cell as a separate image in a subfolder next to the original.
+         """
+         pattern = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
+         matches = re.findall(pattern, md_content, flags=re.IGNORECASE)
+         if not matches:
+             return md_content
+
+         for (col_type, image_path) in matches:
+             logger.info(f"Detected table image in MD: {image_path}, columns={col_type}")
+             # Convert image_path to an absolute path
+             abs_image_path = os.path.join(self.output_folder, image_path)
+
+             try:
+                 if col_type.lower() == 'two':
+                     # For two-column tables
+                     extractor = TableExtractor(
+                         merge_two_col_rows=True,
+                         enable_subtopic_merge=True
+                     )
+                 else:
+                     # For three-column tables
+                     extractor = TableExtractor(
+                         merge_two_col_rows=False,
+                         enable_subtopic_merge=False
+                     )
+
+                 row_boxes = extractor.process_image(abs_image_path)
+
+                 out_folder = abs_image_path + "_rows"
+                 os.makedirs(out_folder, exist_ok=True)
+
+                 extractor.save_extracted_cells(abs_image_path, row_boxes, out_folder)
+                 logger.info(f"Table extraction done for {image_path}, saved to {out_folder}")
+
+             except Exception as e:
+                 logger.error(f"Error processing table image {image_path}: {e}")
+
+         return md_content
+
+ class LocalPDFProcessor:
+     def __init__(self, output_folder: str):
+         self.output_folder = output_folder
+         os.makedirs(self.output_folder, exist_ok=True)
+
+         self.svm_model = SVMModel()
+         logger.info("Classification (SVM) model initialized successfully")
+
+         self.layout_mode = "layoutlmv3"
+         self.ocr_enable = False
+         self.formula_enable = True
+         self.table_enable = False
+         self.language = "en"
+
+         logger.info("LocalPDFProcessor initialized successfully")
+
+     def cleanup_gpu(self):
+         gc.collect()
+         torch.cuda.empty_cache()
+         logger.info("GPU memory cleaned up.")
+
+     def process(self, pdf_path: str) -> str:
+         logger.info(f"Processing local PDF: {pdf_path}")
+         try:
+             # Read PDF bytes
+             with open(pdf_path, "rb") as f:
+                 pdf_bytes = f.read()
+
+             dataset = PymuDocDataset(pdf_bytes)
+             inference = doc_analyze(
+                 dataset,
+                 ocr=self.ocr_enable,
+                 lang=self.language,
+                 layout_model=self.layout_mode,
+                 formula_enable=self.formula_enable,
+                 table_enable=self.table_enable
+             )
+             logger.info("doc_analyze complete. Extracting images...")
+
+             image_writer = LocalImageWriter(self.output_folder, self.svm_model)
+             pipe_result = inference.pipe_ocr_mode(image_writer, lang=self.language)
+
+             logger.info("Image pipeline completed. Generating markdown...")
+             md_content = pipe_result.get_markdown("local-unique-prefix/")
+
+             final_markdown = image_writer.post_process("local-unique-prefix/", md_content)
+
+             # Save the final .md file
+             md_path = os.path.join(self.output_folder, "final_output.md")
+             with open(md_path, "w", encoding="utf-8") as f:
+                 f.write(final_markdown)
+
+             logger.info(f"Markdown saved to: {md_path}")
+             return final_markdown
+
+         finally:
+             self.cleanup_gpu()
+
+ if __name__ == "__main__":
+     input_pdf = "/home/user/app/input_output/aqa-Mathematics-specification.pdf"
+     output_dir = "/home/user/app/input_output/output"
+
+     processor = LocalPDFProcessor(output_folder=output_dir)
+     md_result = processor.process(input_pdf)
+     # print("Final Markdown:\n", md_result)
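
For reference, a minimal sketch of the Markdown rewrite that post_process performs (file names and alt text here are illustrative, not taken from a real run):

    ![](local-unique-prefix/abc.png)                               # before rewrite
    ![Q1 Part A Mark Scheme](images/img_1.png)                     # after: non-table image
    ![HAS TO BE PROCESSED - two column table](images/img_2.png)    # after: table image

Tags of the last form are then matched by _process_table_images_in_markdown, which crops each detected cell into images/img_2.png_rows/row_{i}/col_{j}.png.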
topic_extraction_upgrade.py ADDED
@@ -0,0 +1,423 @@
+ import cv2
+ import numpy as np
+ import logging
+ from pathlib import Path
+ from typing import List, Tuple
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # If you are working with 3-column tables, set `merge_two_col_rows` and `enable_subtopic_merge` to False;
+ # set them to True for 2-column tables (currently hardcoded in __main__ below, for testing).
+
+ class TableExtractor:
+     def __init__(
+         self,
+         # Preprocessing parameters
+         denoise_h: int = 10,
+         clahe_clip: float = 3.0,
+         clahe_grid: int = 8,
+         sharpen_kernel: np.ndarray = np.array([[-1, -1, -1],
+                                                [-1, 9, -1],
+                                                [-1, -1, -1]]),
+         thresh_block_size: int = 21,
+         thresh_C: int = 7,
+
+         # Row detection parameters
+         horizontal_scale: int = 20,
+         row_morph_iterations: int = 2,
+         min_row_height: int = 30,
+         min_row_density: float = 0.01,
+
+         # Column detection parameters
+         vertical_scale: int = 20,
+         col_morph_iterations: int = 2,
+         min_col_height_ratio: float = 0.5,
+         min_col_density: float = 0.01,
+
+         # Bounding box extraction
+         padding: int = 0,
+         skip_header: bool = True,
+
+         # Two-column & subtopic merges
+         merge_two_col_rows: bool = False,
+         enable_subtopic_merge: bool = False,
+         subtopic_threshold: float = 0.2,
+
+         # Gray-artifact filter
+         std_threshold_for_artifacts: float = 5.0,
+
+         # Parameters for the line-removal check
+         line_removal_scale: int = 15,
+         line_removal_iterations: int = 1,
+         min_text_ratio_after_line_removal: float = 0.001
+     ):
+         """
+         :param merge_two_col_rows: If True, a row with exactly 1 vertical line => merged into 1 bounding box.
+         :param enable_subtopic_merge: If True, a row with 2 vertical lines => 3 columns can become 2 if the left one is narrow.
+         :param subtopic_threshold: Fraction of row width for subtopic detection.
+         :param std_threshold_for_artifacts: Grayscale std dev below this => skip as artifact.
+         :param line_removal_scale: Larger => more aggressive line detection inside the cell.
+         :param line_removal_iterations: Morphological iterations for line removal.
+         :param min_text_ratio_after_line_removal: If the fraction of text left after removing lines is below this => skip cell.
+         """
+         # Preprocessing
+         self.denoise_h = denoise_h
+         self.clahe_clip = clahe_clip
+         self.clahe_grid = clahe_grid
+         self.sharpen_kernel = sharpen_kernel
+         self.thresh_block_size = thresh_block_size
+         self.thresh_C = thresh_C
+
+         # Row detection
+         self.horizontal_scale = horizontal_scale
+         self.row_morph_iterations = row_morph_iterations
+         self.min_row_height = min_row_height
+         self.min_row_density = min_row_density
+
+         # Column detection
+         self.vertical_scale = vertical_scale
+         self.col_morph_iterations = col_morph_iterations
+         self.min_col_height_ratio = min_col_height_ratio
+         self.min_col_density = min_col_density
+
+         # Bbox extraction
+         self.padding = padding
+         self.skip_header = skip_header
+
+         # Two-column / subtopic merges
+         self.merge_two_col_rows = merge_two_col_rows
+         self.enable_subtopic_merge = enable_subtopic_merge
+         self.subtopic_threshold = subtopic_threshold
+
+         # Artifact filtering (gray headers, purple, etc.) / currently not working well
+         self.std_threshold_for_artifacts = std_threshold_for_artifacts
+
+         # Line removal inside a cell
+         self.line_removal_scale = line_removal_scale
+         self.line_removal_iterations = line_removal_iterations
+         self.min_text_ratio_after_line_removal = min_text_ratio_after_line_removal
+
+     def preprocess(self, img: np.ndarray) -> np.ndarray:
+         """Grayscale, denoise, CLAHE, sharpen, adaptive threshold (binary_inv)."""
+         if img.ndim == 3:
+             gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+         else:
+             gray = img.copy()
+
+         denoised = cv2.fastNlMeansDenoising(gray, h=self.denoise_h)
+         clahe = cv2.createCLAHE(clipLimit=self.clahe_clip, tileGridSize=(self.clahe_grid, self.clahe_grid))
+         enhanced = clahe.apply(denoised)
+         sharpened = cv2.filter2D(enhanced, -1, self.sharpen_kernel)
+
+         binarized = cv2.adaptiveThreshold(
+             sharpened, 255,
+             cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+             cv2.THRESH_BINARY_INV,
+             self.thresh_block_size,
+             self.thresh_C
+         )
+         return binarized
+
+     def detect_full_rows(self, bin_img: np.ndarray) -> List[Tuple[int, int]]:
+         """Find horizontal row boundaries in the binarized image."""
+         h_kernel_size = max(1, bin_img.shape[1] // self.horizontal_scale)
+         horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (h_kernel_size, 1))
+
+         horizontal_lines = cv2.morphologyEx(bin_img, cv2.MORPH_OPEN, horizontal_kernel,
+                                             iterations=self.row_morph_iterations)
+         row_projection = np.sum(horizontal_lines, axis=1)
+         max_val = np.max(row_projection) if len(row_projection) else 0
+
+         # If no lines are found, treat the entire image as one row (optional fallback)
+         if max_val < 1e-5:
+             return [(0, bin_img.shape[0])]
+
+         threshold_val = 0.3 * max_val
+         line_indices = np.where(row_projection > threshold_val)[0]
+
+         if len(line_indices) < 2:
+             return [(0, bin_img.shape[0])]
+
+         # Group consecutive indices into line positions
+         lines = []
+         current = [line_indices[0]]
+         for i in range(1, len(line_indices)):
+             if line_indices[i] - line_indices[i - 1] <= 2:
+                 current.append(line_indices[i])
+             else:
+                 lines.append(int(np.mean(current)))
+                 current = [line_indices[i]]
+         if current:
+             lines.append(int(np.mean(current)))
+
+         row_bounds = []
+         for i in range(len(lines) - 1):
+             y1 = lines[i]
+             y2 = lines[i + 1]
+             if (y2 - y1) >= self.min_row_height:
+                 row_bounds.append((y1, y2))
+
+         return row_bounds if row_bounds else [(0, bin_img.shape[0])]
+
+     def detect_columns_in_row(self, row_img: np.ndarray, y1: int, y2: int) -> List[Tuple[int, int, int, int]]:
+         """
+         Detect up to two vertical lines => up to 3 bounding boxes.
+         - 0 lines => 1 bounding box
+         - 1 line  => 2 bounding boxes (unless merge_two_col_rows => 1)
+         - 2 lines => 3 bounding boxes by default;
+           if enable_subtopic_merge and the left box is narrower than subtopic_threshold => 2 boxes
+         """
+         row_height = (y2 - y1)
+         row_width = row_img.shape[1]
+
+         # Morphological kernel for vertical lines
+         v_kernel_size = max(1, row_height // self.vertical_scale)
+         vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, v_kernel_size))
+
+         vertical_lines = cv2.morphologyEx(row_img, cv2.MORPH_OPEN, vertical_kernel,
+                                           iterations=self.col_morph_iterations)
+         vertical_lines = cv2.dilate(vertical_lines, np.ones((3, 3), np.uint8), iterations=1)
+
+         # Find contours => x positions of candidate dividers
+         contours, _ = cv2.findContours(vertical_lines, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+         x_positions = []
+         for c in contours:
+             x, y, w, h = cv2.boundingRect(c)
+             # Must span at least half the row height to be considered a real column divider
+             if h >= self.min_col_height_ratio * row_height:
+                 x_positions.append(x)
+         x_positions = sorted(set(x_positions))
+
+         # Keep at most 2 vertical lines
+         if len(x_positions) > 2:
+             x_positions = x_positions[:2]
+
+         # Build bounding boxes
+         if len(x_positions) == 0:
+             # 0 lines => single bounding box
+             boxes = [(0, y1, row_width, row_height)]
+
+         elif len(x_positions) == 1:
+             # 1 line => 2 bounding boxes by default
+             x1 = x_positions[0]
+             if self.merge_two_col_rows:
+                 # Merge => single bounding box
+                 boxes = [(0, y1, row_width, row_height)]
+             else:
+                 boxes = [
+                     (0, y1, x1, row_height),
+                     (x1, y1, row_width - x1, row_height)
+                 ]
+
+         else:
+             # 2 lines => normally 3 bounding boxes
+             x1, x2 = sorted(x_positions)
+             if self.enable_subtopic_merge:
+                 # If the left bounding box is very narrow => treat it as a subtopic => 2 bounding boxes
+                 left_box_width = x1
+                 if left_box_width < (self.subtopic_threshold * row_width):
+                     boxes = [
+                         (0, y1, x1, row_height),
+                         (x1, y1, row_width - x1, row_height)
+                     ]
+                 else:
+                     boxes = [
+                         (0, y1, x1, row_height),
+                         (x1, y1, x2 - x1, row_height),
+                         (x2, y1, row_width - x2, row_height)
+                     ]
+             else:
+                 boxes = [
+                     (0, y1, x1, row_height),
+                     (x1, y1, x2 - x1, row_height),
+                     (x2, y1, row_width - x2, row_height)
+                 ]
+
+         # Filter out columns with insufficient density
+         filtered = []
+         for (x, y, w, h) in boxes:
+             if w <= 0:
+                 continue
+             subregion = row_img[:, x : x + w]
+             white_pixels = np.sum(subregion == 255)
+             total_pixels = subregion.size
+             if total_pixels == 0:
+                 continue
+             density = white_pixels / total_pixels
+             if density >= self.min_col_density:
+                 filtered.append((x, y, w, h))
+
+         return filtered
+
+     def process_image(self, image_path: str) -> List[List[Tuple[int, int, int, int]]]:
+         """
+         1) Preprocess => bin_img
+         2) Detect row segments
+         3) Filter out rows by density (optionally skipping the first row as a header)
+         4) For each row => detect columns => bounding boxes
+         """
+         img = cv2.imread(image_path)
+         if img is None:
+             raise ValueError(f"Could not read image: {image_path}")
+
+         bin_img = self.preprocess(img)
+         row_segments = self.detect_full_rows(bin_img)
+
+         # Filter out rows with insufficient density
+         valid_rows = []
+         for (y1, y2) in row_segments:
+             row_region = bin_img[y1:y2, :]
+             area = row_region.size
+             if area == 0:
+                 continue
+             white_pixels = np.sum(row_region == 255)
+             density = white_pixels / area
+             if density >= self.min_row_density:
+                 valid_rows.append((y1, y2))
+
+         # Possibly skip the header row
+         if self.skip_header and len(valid_rows) > 1:
+             valid_rows = valid_rows[1:]
+
+         # Detect columns in each row
+         all_rows_boxes = []
+         for (y1, y2) in valid_rows:
+             row_img = bin_img[y1:y2, :]
+             col_boxes = self.detect_columns_in_row(row_img, y1, y2)
+             if col_boxes:
+                 all_rows_boxes.append(col_boxes)
+
+         return all_rows_boxes
+
+     def extract_box_image(self, original: np.ndarray, box: Tuple[int, int, int, int]) -> np.ndarray:
+         """Crop a bounding box from the original image, with optional padding."""
+         x, y, w, h = box
+         Y1 = max(0, y - self.padding)
+         Y2 = min(original.shape[0], y + h + self.padding)
+         X1 = max(0, x - self.padding)
+         X2 = min(original.shape[1], x + w + self.padding)
+         return original[Y1:Y2, X1:X2]
+
+     def _remove_lines_in_cell(self, gray_bin: np.ndarray) -> np.ndarray:
+         """
+         Remove horizontal + vertical lines from a binarized subregion
+         and return the 'text-only' mask.
+         """
+         # 1) Horizontal line detection
+         horiz_kernel_size = max(1, gray_bin.shape[1] // self.line_removal_scale)
+         horiz_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (horiz_kernel_size, 1))
+         horizontal = cv2.morphologyEx(gray_bin, cv2.MORPH_OPEN, horiz_kernel, iterations=self.line_removal_iterations)
+
+         # 2) Vertical line detection
+         vert_kernel_size = max(1, gray_bin.shape[0] // self.line_removal_scale)
+         vert_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, vert_kernel_size))
+         vertical = cv2.morphologyEx(gray_bin, cv2.MORPH_OPEN, vert_kernel, iterations=self.line_removal_iterations)
+
+         # Combine the lines
+         lines = cv2.bitwise_or(horizontal, vertical)
+         # Subtract from the original => text-only
+         text_only = cv2.bitwise_and(gray_bin, cv2.bitwise_not(lines))
+         return text_only
+
+     def is_grey_artifact(self, cell_img: np.ndarray) -> bool:
+         """
+         1) If the grayscale std dev < std_threshold_for_artifacts => skip as uniform.
+         2) Otherwise, remove lines from an Otsu-binarized version of the cell
+            and check whether enough text is left. If not, skip as artifact.
+         """
+         if cell_img.size == 0:
+             return True
+
+         gray = cv2.cvtColor(cell_img, cv2.COLOR_BGR2GRAY)
+         std_val = np.std(gray)
+         if std_val < self.std_threshold_for_artifacts:
+             return True
+
+         # 2) Binarize => remove lines => check leftover text
+         #    (Otsu threshold on the local cell)
+         _, cell_bin = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)
+
+         text_only = self._remove_lines_in_cell(cell_bin)
+         nonzero_text = cv2.countNonZero(text_only)
+         ratio = nonzero_text / float(cell_bin.size)
+
+         if ratio < self.min_text_ratio_after_line_removal:
+             # Hardly any text remains => artifact
+             return True
+
+         return False
+
+     def save_extracted_cells(
+         self, image_path: str, row_boxes: List[List[Tuple[int, int, int, int]]], output_dir: str
+     ):
+         """Save each cell from the original image, skipping uniform/gray artifacts."""
+         out_path = Path(output_dir)
+         out_path.mkdir(exist_ok=True, parents=True)
+
+         original = cv2.imread(image_path)
+         if original is None:
+             raise ValueError(f"Could not read original image: {image_path}")
+
+         for i, row in enumerate(row_boxes):
+             row_dir = out_path / f"row_{i}"
+             row_dir.mkdir(exist_ok=True)
+             for j, box in enumerate(row):
+                 cell_img = self.extract_box_image(original, box)
+                 # Skip uniform/grey or line-only artifacts
+                 if self.is_grey_artifact(cell_img):
+                     logger.info(f"Skipping artifact cell at row={i}, col={j}. (uniform/grey/line-only)")
+                     continue
+
+                 out_file = row_dir / f"col_{j}.png"
+                 cv2.imwrite(str(out_file), cell_img)
+                 logger.info(f"Saved cell image row={i}, col={j} -> {out_file}")
+
+ class TableExtractorApp:
+     def __init__(self, extractor: TableExtractor):
+         self.extractor = extractor
+
+     def run(self, input_image: str, output_folder: str):
+         row_boxes = self.extractor.process_image(input_image)
+         logger.info(f"Detected {len(row_boxes)} row(s).")
+         self.extractor.save_extracted_cells(input_image, row_boxes, output_folder)
+         logger.info("Done. Check the output folder for results.")
+
+
+ if __name__ == "__main__":
+     input_image = "images/test/img_2.png"
+     output_folder = "refined_outp"
+
+     # Two-column configuration (see the comment at the top of this file)
+     extractor = TableExtractor(
+         denoise_h=10,
+         clahe_clip=3.0,
+         clahe_grid=8,
+         thresh_block_size=21,
+         thresh_C=7,
+
+         horizontal_scale=20,
+         row_morph_iterations=2,
+         min_row_height=30,
+         min_row_density=0.01,
+
+         vertical_scale=20,
+         col_morph_iterations=2,
+         min_col_height_ratio=0.5,
+         min_col_density=0.01,
+
+         padding=1,
+         skip_header=True,
+
+         merge_two_col_rows=True,
+         enable_subtopic_merge=True,
+         subtopic_threshold=0.2,
+
+         std_threshold_for_artifacts=10.0,
+         line_removal_scale=20,
+         line_removal_iterations=1,
+         min_text_ratio_after_line_removal=0.001
+     )
+
+     app = TableExtractorApp(extractor)
+     app.run(input_image, output_folder)
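
For three-column tables, a minimal counterpart sketch of the configuration described in the comment at the top of this file (the input path below is a hypothetical placeholder, not part of the commit):

    from topic_extraction_upgrade import TableExtractor

    # Leave both merge flags at their False defaults so a row with two vertical
    # dividers yields three cell images instead of being merged.
    extractor = TableExtractor(merge_two_col_rows=False, enable_subtopic_merge=False)
    row_boxes = extractor.process_image("images/test/img_3.png")  # hypothetical test image
    extractor.save_extracted_cells("images/test/img_3.png", row_boxes, "three_col_out")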