correct page range handling
- __pycache__/inference_svm_model.cpython-310.pyc +0 -0
- __pycache__/mineru_single.cpython-310.pyc +0 -0
- __pycache__/table_row_extraction.cpython-310.pyc +0 -0
- __pycache__/worker.cpython-310.pyc +0 -0
- output/images/img_1.png +0 -0
- output/images/img_10.png +0 -0
- output/images/img_11.png +0 -0
- output/images/img_12.png +0 -0
- output/images/img_13.png +0 -0
- output/images/img_14.png +0 -0
- output/images/img_15.png +0 -0
- output/images/img_16.png +0 -0
- output/images/img_17.png +0 -0
- output/images/img_18.png +0 -0
- output/images/img_19.png +0 -0
- output/images/img_2.png +0 -0
- output/images/img_20.png +0 -0
- output/images/img_21.png +0 -0
- output/images/img_22.png +0 -0
- output/images/img_23.png +0 -0
- output/images/img_24.png +0 -0
- output/images/img_25.png +0 -0
- output/images/img_26.png +0 -0
- output/images/img_27.png +0 -0
- output/images/img_28.png +0 -0
- output/images/img_3.png +0 -0
- output/images/img_4.png +0 -0
- output/images/img_5.png +0 -0
- output/images/img_6.png +0 -0
- output/images/img_7.png +0 -0
- output/images/img_8.png +0 -0
- output/images/img_9.png +0 -0
- topic_extr.py +350 -309
__pycache__/inference_svm_model.cpython-310.pyc
CHANGED
Binary files a/__pycache__/inference_svm_model.cpython-310.pyc and b/__pycache__/inference_svm_model.cpython-310.pyc differ

__pycache__/mineru_single.cpython-310.pyc
CHANGED
Binary files a/__pycache__/mineru_single.cpython-310.pyc and b/__pycache__/mineru_single.cpython-310.pyc differ

__pycache__/table_row_extraction.cpython-310.pyc
CHANGED
Binary files a/__pycache__/table_row_extraction.cpython-310.pyc and b/__pycache__/table_row_extraction.cpython-310.pyc differ

__pycache__/worker.cpython-310.pyc
CHANGED
Binary files a/__pycache__/worker.cpython-310.pyc and b/__pycache__/worker.cpython-310.pyc differ
output/images/img_1.png … output/images/img_28.png (28 binary image files)
ADDED
topic_extr.py
CHANGED
Removed from the previous version of topic_extr.py (309 lines deleted):

- the standalone shrink_image_to_jpeg(...) helper, which downscaled an image with OpenCV (cv2.resize with INTER_AREA, JPEG re-encode) and returned the original bytes if encoding failed; the new version inlines this shrinking inside the table-classification call;
- call_gemini_for_image_description(...), which sent the shrunken image to gemini-2.0-flash and asked for a short description identifying the question number and part (with an "MCQ: A [option] B [option] C [option] D [option]" suffix for multiple-choice options), falling back to "Image description unavailable";
- the concurrent image-description pass in LocalImageWriter.post_process, which called that function for every image classified as NO_TABLE and used the result as the image's alt text;
- MineruNoTextProcessor._collect_page_indices(...), which flattened {"Topic": [start, end]} ranges (e.g. {"Topic A": [11, 29], "Topic B": [30, 42]} => [11..29, 30..42]) and raised ValueError on an invalid range;
- the old process() flow, which raised ValueError when no topics or page indices could be extracted from the table of contents, rather than falling back to processing the whole PDF.

The updated file follows.
# … (lines 1-5 of the file are unchanged and not shown in this diff view) …
import logging
import fitz
import base64
import concurrent.futures
from io import BytesIO
from typing import List, Dict, Any

import torch
import cv2
import numpy as np

# Attempt top-level import of google.genai
try:
    from google import genai
    from google.genai import types
except ImportError:
    genai = None
    types = None

# magic-pdf imports
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze

# table extraction logic
from table_row_extraction import TableExtractor

###############################################################################
# Logging Setup
###############################################################################
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

###############################################################################
# PDF Utility Functions
###############################################################################
def unify_whitespace(text: str) -> str:
    """
    Replace runs of whitespace with a single space, strip leading/trailing, then lowercase.
    """
    return re.sub(r"\s+", " ", text).strip().lower()

def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> bytes:
    """
    Creates a new PDF (in memory) containing only pages in page_indices (0-based).
    Raises ValueError if page_indices is empty or out of range.
    """
    if not page_indices:
        raise ValueError("No page indices provided for subset creation.")

    doc = fitz.open(stream=original_pdf_bytes, filetype="pdf")
    new_doc = fitz.open()
    for p in sorted(set(page_indices)):
        if 0 <= p < doc.page_count:
            new_doc.insert_pdf(doc, from_page=p, to_page=p)
        else:
            logger.error(f"Page index {p} out of range (0..{doc.page_count - 1}).")
            raise ValueError(f"Page index {p} out of range.")
    subset_bytes = new_doc.tobytes()
    new_doc.close()
    doc.close()
    return subset_bytes

###############################################################################
# Searching in PDF
###############################################################################
def find_all_occurrences(pdf_bytes: bytes, search_text: str) -> List[int]:
    """
    Return a sorted list of 0-based pages in which `search_text` (normalized) appears,
    scanning the entire PDF in RAW mode.
    """
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    st_norm = unify_whitespace(search_text)
    found = []
    for i in range(doc.page_count):
        raw = doc[i].get_text("raw")
        norm = unify_whitespace(raw)
        if st_norm in norm:
            found.append(i)
    doc.close()
    return sorted(found)
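A minimal sketch of how these helpers compose, with a hypothetical file path (find_all_occurrences matches on whitespace-normalized, lowercased text, so a heading can be pasted straight from the table of contents):

    # Sketch only: locate a heading and cut a small subset PDF around it.
    with open("spec.pdf", "rb") as fh:      # illustrative path
        pdf_bytes = fh.read()

    hits = find_all_occurrences(pdf_bytes, "Paper 1 and Paper 2: Pure Mathematics")
    if hits:
        first = hits[0]                     # first 0-based page containing the heading
        # create_subset_pdf raises ValueError if the range runs past the last page
        subset = create_subset_pdf(pdf_bytes, list(range(first, first + 5)))
        with open("subset.pdf", "wb") as out:
            out.write(subset)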
###############################################################################
# Gemini LLM for Subtopic Extraction
###############################################################################
class GeminiTopicExtractor:
    """
    Extract subtopics from the PDF by reading the first `num_pages` pages, calling Gemini.
    We expect a structure like:
    {
      "2 Subject content and assessment information": {
          "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
          "Paper 3: Statistics and Mechanics": [30, 42]
      }
    }
    or sometimes just a flat dict:
    {
      "Paper 1 and Paper 2: Pure Mathematics": [15, 33],
      "Paper 3: Statistics and Mechanics": [34, 46]
    }
    We'll parse both forms.
    """
    def __init__(self, api_key: str = None, num_pages: int = 10):
        self.api_key = api_key or os.getenv("GEMINI_API_KEY", "")
        if not self.api_key:
            logger.warning("No Gemini API key for subtopic extraction.")
        self.num_pages = num_pages

    def extract_subtopics(self, pdf_path: str) -> Dict[str, List[int]]:
        """
        Return a dict of subtopics => [start_page, end_page].
        Could be empty if parsing fails or the LLM can't find subtopics.
        """
        first_pages_text = self._read_first_pages_raw(pdf_path, self.num_pages)
        if not first_pages_text.strip():
            logger.error("No text from first pages => cannot extract subtopics.")
            return {}

        if genai is None or types is None:
            # … unchanged line not shown in this diff view (warns that google.genai is unavailable) …
            return {}

        prompt = f"""
        You have the first pages of a PDF specification, including a table of contents.

        Instructions:
        1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
        2. Extract subtopic names -> [start_page, end_page], in valid JSON format only.
        3. If you can't find any subtopics, return an empty JSON.

        Examples:

        1. Given this table of contents:

        1 Introduction – 2
        Why choose Edexcel A Level Mathematics? - 2
        Supporting you in planning and implementing this qualification - 3
        Qualification at a glance - 5
        2 Subject content and assessment information – 7
        Paper 1 and Paper 2: Pure Mathematics - 11
        Paper 3: Statistics and Mechanics - 30
        Assessment Objectives - 40
        3 Administration and general information – 42
        Entries - 42
        Access arrangements, reasonable adjustments, special consideration and malpractice - 42
        Student recruitment and progression - 45
        Appendix 1: Formulae – 49
        Appendix 2: Notation – 53
        Appendix 3: Use of calculators – 59
        Appendix 4: Assessment Objectives – 60
        Appendix 5: The context for the development of this qualification – 62
        Appendix 6: Transferable skills – 64
        Appendix 7: Level 3 Extended Project qualification – 65
        Appendix 8: Codes – 67

        The correct output should be:

        {{
            "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
            "Paper 3: Statistics and Mechanics": [30, 42]
        }}

        2. Given this table of contents:

        Qualification at a glance – 1
        Assessment Objectives and weightings - 4
        Knowledge, skills and understanding – 5
        Theme 1: Introduction to markets and market failure - 5
        Theme 2: The UK economy – performance and policies - 11
        Theme 3: Business behaviour and the labour market - 21
        Theme 4: A global perspective - 29
        Assessment – 39
        Assessment summary - 39
        Assessment objectives - 41
        Assessment overview - 42
        Breakdown of assessment objectives - 42
        Synoptic assessment - 43
        Discount code and performance tables - 43
        Access arrangements, reasonable adjustments and special consideration - 44
        Malpractice - 45
        Equality Act 2010 and Pearson equality policy - 45
        Synoptic assessment - 46
        Awarding and reporting - 47
        Other information – 49
        Student recruitment - 49
        Prior learning and other requirements - 49
        Progression - 49
        Appendix 1: Transferable skills – 53
        Appendix 2: Level 3 Extended Project qualification – 55
        Appendix 3: Quantitative skills – 59
        Appendix 4: Codes – 61
        Appendix 5: Index – 63

        The correct output should be:

        {{
            "Theme 1: Introduction to markets and market failure": [5, 10],
            "Theme 2: The UK economy – performance and policies": [11, 20],
            "Theme 3: Business behaviour and the labour market": [21, 28],
            "Theme 4: A global perspective": [29, 38]
        }}

        Now, extract topics from this text:
        {first_pages_text}
        """
        try:
            client = genai.Client(api_key=self.api_key)
            response = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=[prompt],
                config=types.GenerateContentConfig(temperature=0.0)
            )
            if not response or not response.text:
                logger.warning("No text from LLM => returning empty subtopics.")
                return {}

            raw_json = response.text.strip()
            # Clean up triple backticks
            cleaned = raw_json.replace("```json", "").replace("```", "")

            # Attempt to parse
            data = json.loads(cleaned)
            # data might be nested or flat
            # if nested, e.g. {"2 Subject content": {"Paper 1...": [11,29]}}
            # if flat, e.g. {"Paper 1...": [11,29]}
            # We'll unify it to a single dict of subname => [start,end].
            final_dict = {}

            # If the top-level is a dict of dicts,
            # we look for a dict whose values are themselves subtopics,
            # or it might be a direct subtopic dict. Quick approach:
            #  - if any top-level value is a dict with numeric arrays, use that
            #  - else assume data is the direct subtopic dict
            found_sub_dict = None
            for k, v in data.items():
                if isinstance(v, dict):
                    # might be the sub-sub dict
                    found_sub_dict = v
                    break

            if found_sub_dict is not None:
                # parse found_sub_dict
                for subk, rng in found_sub_dict.items():
                    if isinstance(rng, list) and len(rng) == 2:
                        final_dict[subk] = rng
            else:
                # maybe data is the direct subtopic dict
                for subk, rng in data.items():
                    if isinstance(rng, list) and len(rng) == 2:
                        final_dict[subk] = rng

            return final_dict
        except Exception as e:
            logger.error(f"Gemini subtopic extraction error: {e}")
            return {}

    def _read_first_pages_raw(self, pdf_path: str, num_pages: int) -> str:
        text_parts = []
        try:
            doc = fitz.open(pdf_path)
            pages_to_read = min(num_pages, doc.page_count)
            for i in range(pages_to_read):
                raw_text = doc[i].get_text("raw")
                text_parts.append(raw_text)
            doc.close()
        except Exception as e:
            logger.error(f"Could not open PDF: {e}")
        return "\n".join(text_parts)
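For reference, these are the two response shapes the parser accepts (values taken from the docstring example); both flatten to the same subtopic-to-range dict, because the first dict-valued entry wins when one exists:

    # Nested: a top-level section wrapping the subtopics.
    nested = {
        "2 Subject content and assessment information": {
            "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
            "Paper 3: Statistics and Mechanics": [30, 42],
        }
    }

    # Flat: the subtopics sit directly at the top level.
    flat = {
        "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
        "Paper 3: Statistics and Mechanics": [30, 42],
    }

    # extract_subtopics() yields {"Paper 1 and Paper 2: Pure Mathematics": [11, 29],
    #                             "Paper 3: Statistics and Mechanics": [30, 42]} in both cases.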
###############################################################################
# Concurrency for Table Classification
###############################################################################
def call_gemini_for_table_classification(image_data: bytes, api_key: str) -> str:
    """
    Classify an image as TWO_COLUMN, THREE_COLUMN, or NO_TABLE using Gemini.
    """
    if not api_key:
        logger.warning("No Gemini API key => NO_TABLE.")
        return "NO_TABLE"
    if genai is None or types is None:
        logger.warning("google.genai not installed => NO_TABLE.")
        return "NO_TABLE"

    # Attempt to shrink
    try:
        arr = np.frombuffer(image_data, np.uint8)
        img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
        if img is not None:
            h, w, _ = img.shape
            max_dim = 800
            scale = 1.0
            if max(h, w) > max_dim:
                scale = max_dim / float(max(h, w))
            if scale < 1.0:
                new_w = int(w * scale)
                new_h = int(h * scale)
                img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
            encode_params = [int(cv2.IMWRITE_JPEG_QUALITY), 70]
            success, enc = cv2.imencode(".jpg", img, encode_params)
            if success:
                image_data = enc.tobytes()
    except Exception as e:
        logger.warning(f"shrink_image_to_jpeg error: {e}")

    prompt = """You are given an image. Determine if it shows a table that has exactly 2 or 3 columns.
Return only one label:
TWO_COLUMN
THREE_COLUMN
NO_TABLE
"""
    try:
        client = genai.Client(api_key=api_key)
        resp = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[
                {
                    # … unchanged lines not shown in this diff view (the "parts" list opens here,
                    #     presumably with a {"text": prompt} entry) …
                    {
                        "inline_data": {
                            "mime_type": "image/jpeg",
                            "data": base64.b64encode(image_data).decode('utf-8')
                        }
                    }
                    ]
                # … unchanged closing brace not shown …
            ],
            config=types.GenerateContentConfig(temperature=0.0)
        )
        if resp and resp.text:
            classification = resp.text.strip().upper()
            if "THREE" in classification:
                return "THREE_COLUMN"
            elif "TWO" in classification:
                return "TWO_COLUMN"
        return "NO_TABLE"
    except Exception as e:
        logger.error(f"Gemini table classification error: {e}")
        return "NO_TABLE"
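A short sketch of calling the classifier directly on one of the images added in this commit (assumes a valid GEMINI_API_KEY in the environment; without one the function simply returns "NO_TABLE"):

    # Sketch only: classify a single extracted image.
    with open("output/images/img_1.png", "rb") as fh:
        png_bytes = fh.read()

    label = call_gemini_for_table_classification(png_bytes, os.getenv("GEMINI_API_KEY", ""))
    print(label)  # "TWO_COLUMN", "THREE_COLUMN", or "NO_TABLE"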
###############################################################################
# LocalImageWriter
###############################################################################
class LocalImageWriter:
    """
    Writes extracted images, then does concurrency-based table classification calls.
    """
    def __init__(self, output_folder: str, gemini_api_key: str):
        self.output_folder = output_folder
        # … unchanged lines not shown in this diff view (images_dir setup, descriptions dict, _img_count, …) …
        self.gemini_api_key = gemini_api_key

    def write(self, path: str, data: bytes) -> None:
        self._img_count += 1
        fname = f"img_{self._img_count}.png"
        fpath = os.path.join(self.images_dir, fname)
        with open(fpath, "wb") as f:
            f.write(data)
        rel_path = os.path.relpath(fpath, self.output_folder)
        self.descriptions[path] = {
            "data": data,
            "relative_path": rel_path,
            "table_classification": "NO_TABLE",
            "final_alt": ""
        }

    def post_process(self, key: str, md_content: str) -> str:
        logger.info("Classifying images to detect tables (concurrent)...")
        with concurrent.futures.ThreadPoolExecutor(max_workers=6) as exe:
            fut_map = {}
            for p, info in self.descriptions.items():
                fut = exe.submit(call_gemini_for_table_classification, info["data"], self.gemini_api_key)
                fut_map[fut] = p

            for fut in concurrent.futures.as_completed(fut_map):
                path = fut_map[fut]
                try:
                    classification = fut.result()
                    self.descriptions[path]['table_classification'] = classification
                except Exception as e:
                    logger.error(f"Table classification error: {e}")
                    self.descriptions[path]['table_classification'] = "NO_TABLE"

        # 2) Set final alt text
        for p, info in self.descriptions.items():
            cls = info['table_classification']
            if cls == "TWO_COLUMN":
                info['final_alt'] = "HAS TO BE PROCESSED - two column table"
            elif cls == "THREE_COLUMN":
                info['final_alt'] = "HAS TO BE PROCESSED - three column table"
            else:
                info['final_alt'] = "NO_TABLE image"

        # 3) Replace placeholders in the Markdown
        for p, info in self.descriptions.items():
            # … unchanged line not shown in this diff view (old_md, the placeholder reference emitted by magic-pdf) …
            new_md = f"![{info['final_alt']}]({info['relative_path']})"
            md_content = md_content.replace(old_md, new_md)

        # 4) If any table images => extract rows
        md_content = self._process_table_images_in_markdown(md_content)

        # 5) Keep only lines that are image references
        final_lines = []
        for line in md_content.split("\n"):
            if re.match(r"^\!\[.*\]\(.*\)", line.strip()):
                # … unchanged lines not shown in this diff view (the matching line is kept in final_lines) …
        return "\n".join(final_lines)

    def _process_table_images_in_markdown(self, md_content: str) -> str:
        pat = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
        matches = re.findall(pat, md_content, flags=re.IGNORECASE)
        if not matches:
            return md_content

        # … unchanged lines not shown in this diff view (iterate over matches, resolve abs_image_path,
        #     open a try block and construct the TableExtractor with its remaining arguments) …
                enable_subtopic_merge=False,
                subtopic_threshold=0.2
            )
            row_boxes = extractor.process_image(abs_image_path)
            out_folder = abs_image_path + "_rows"
            os.makedirs(out_folder, exist_ok=True)
            extractor.save_extracted_cells(abs_image_path, row_boxes, out_folder)

            snippet = ["**Extracted table cells:**"]
            for i, row in enumerate(row_boxes):
                row_dir = os.path.join(out_folder, f"row_{i}")
                for j, _ in enumerate(row):
                    cell_file = f"col_{j}.png"
                    cell_path = os.path.join(row_dir, cell_file)
                    relp = os.path.relpath(cell_path, self.output_folder)
                    snippet.append(f"")  # markdown image reference for the cell; the literal was stripped by the page rendering

            new_snip = "\n".join(snippet)
            old_line = f""  # the matched table-image reference; the literal was stripped by the page rendering
            md_content = md_content.replace(old_line, new_snip)
        except Exception as e:
            logger.error(f"Error processing table image {image_path}: {e}")

        return md_content
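To make step 4 concrete: a reference such as

    ![HAS TO BE PROCESSED - two column table](images/img_5.png)

is replaced, roughly, by one reference per extracted cell under the image's _rows folder (illustrative paths following the row_/col_ naming above; the per-cell alt text literal is not recoverable from this page view):

    **Extracted table cells:**
    ![](images/img_5.png_rows/row_0/col_0.png)
    ![](images/img_5.png_rows/row_0/col_1.png)
    ![](images/img_5.png_rows/row_1/col_0.png)
    ![](images/img_5.png_rows/row_1/col_1.png)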
###############################################################################
# MineruNoTextProcessor
###############################################################################
class MineruNoTextProcessor:
    """
    1) Use Gemini to get subtopics => e.g. {"Paper 1 and Paper 2: Pure Mathematics": [11,29], ...}
    2) For each subtopic name => find real occurrence in PDF at or after (start_page-1).
    3) offset = occurrence_page - (start_page-1). clamp offset >= 0
    4) Flatten final pages, subset PDF, run magic-pdf => concurrency => final MD
    5) If no subtopics found, process entire PDF as fallback.
    """
    def __init__(self, output_folder: str, gemini_api_key: str = None):
        self.output_folder = output_folder
        # … unchanged lines not shown in this diff view …
        self.layout_model = "doclayout_yolo"
        self.formula_enable = True
        self.table_enable = False
        self.language = "en"

        # Use our new flexible approach
        self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=10)
        self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY", "")

    def cleanup_gpu(self):
        # … unchanged lines not shown in this diff view …

    def process(self, pdf_path: str) -> str:
        logger.info(f"Processing PDF: {pdf_path}")
        try:
            # 1) Extract subtopics from Gemini
            subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
            logger.info(f"Gemini returned subtopics: {subtopics}")

            # 2) Read entire PDF
            with open(pdf_path, "rb") as f:
                pdf_bytes = f.read()
            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
            total_pages = doc.page_count
            doc.close()

            final_pages = set()
            if not subtopics:
                logger.warning("No subtopics found. We'll process the entire PDF as fallback.")
                final_pages = set(range(total_pages))
            else:
                # For each subtopic, find occurrence >= (start_p-1)
                for subname, rng in subtopics.items():
                    if not (isinstance(rng, list) and len(rng) == 2):
                        logger.warning(f"Skipping subtopic '{subname}' => invalid range {rng}")
                        continue
                    start_p, end_p = rng
                    if start_p > end_p:
                        logger.warning(f"Skipping subtopic '{subname}' => start> end {rng}")
                        continue

                    # find occurrences
                    occs = find_all_occurrences(pdf_bytes, subname)
                    logger.info(f"Occurrences of subtopic '{subname}': {occs}")

                    doc_start_0 = start_p - 1
                    chosen_page = None
                    for p in occs:
                        if p >= doc_start_0:
                            chosen_page = p
                            break
                    if chosen_page is None:
                        # fallback to last or 0
                        if occs:
                            chosen_page = occs[-1]
                            logger.warning(f"No occurrence >= {doc_start_0} for '{subname}'. Using last => {chosen_page}")
                        else:
                            chosen_page = 0
                            logger.warning(f"No occurrences for '{subname}'. Using page 0.")

                    raw_offset = chosen_page - doc_start_0
                    offset = max(0, raw_offset)
                    logger.info(f"Subtopic '{subname}': doc_start={start_p}, chosen_page={chosen_page}, raw_offset={raw_offset}, offset={offset}")

                    s0 = (start_p - 1) + offset
                    e0 = (end_p - 1) + offset
                    s0 = max(0, min(total_pages - 1, s0))
                    e0 = max(0, min(total_pages - 1, e0))
                    for pp in range(s0, e0 + 1):
                        final_pages.add(pp)

            # 3) If final_pages is empty => fallback entire PDF
            if not final_pages:
                logger.warning("No valid pages after offset. We'll process entire PDF.")
                final_pages = set(range(total_pages))

            logger.info(f"Processing pages (0-based): {sorted(final_pages)}")
            subset_pdf_bytes = create_subset_pdf(pdf_bytes, sorted(final_pages))

            # 4) doc_analyze => concurrency => final MD
            dataset = PymuDocDataset(subset_pdf_bytes)
            inference = doc_analyze(
                dataset,
                # … unchanged arguments not shown in this diff view …
            )
            logger.info("doc_analyze complete. Extracting images...")

            writer = LocalImageWriter(self.output_folder, self.gemini_api_key)
            pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
            md_content = pipe_result.get_markdown("local-unique-prefix/")

            final_markdown = writer.post_process("local-unique-prefix/", md_content)

            # 5) Save
            out_path = os.path.join(self.output_folder, "final_output.md")
            with open(out_path, "w", encoding="utf-8") as f:
                f.write(final_markdown)

            logger.info(f"Markdown saved to: {out_path}")
            return final_markdown

        finally:
            self.cleanup_gpu()
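A worked example of the offset correction that gives this commit its name, with hypothetical numbers mirroring the logic in process(). The table of contents reports printed page numbers, which typically drift from 0-based PDF indices once covers and front matter are counted, so each range is shifted to where its heading actually occurs:

    # Hypothetical numbers only.
    start_p, end_p = 11, 29      # range reported by the table of contents (printed, 1-based)
    chosen_page = 14             # first 0-based PDF page where the heading text is actually found
    doc_start_0 = start_p - 1    # 10
    offset = max(0, chosen_page - doc_start_0)   # 4
    s0 = (start_p - 1) + offset                  # 14
    e0 = (end_p - 1) + offset                    # 32 -> pages 14..32 (0-based) are added to final_pages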
###############################################################################
# Example Main
###############################################################################
if __name__ == "__main__":
    input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
    output_dir = "/home/user/app/output"
    gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")

    try:
        processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=gemini_key)
        md_output = processor.process(input_pdf)
        print("Final Markdown Output:")
        print(md_output)
    except Exception as e:
        logger.error(f"Processing failed: {e}")