SkyNait committed on
Commit 9351a05 · 1 Parent(s): c10a9aa

page handling

__pycache__/inference_svm_model.cpython-310.pyc CHANGED
Binary files a/__pycache__/inference_svm_model.cpython-310.pyc and b/__pycache__/inference_svm_model.cpython-310.pyc differ
 
__pycache__/mineru_single.cpython-310.pyc CHANGED
Binary files a/__pycache__/mineru_single.cpython-310.pyc and b/__pycache__/mineru_single.cpython-310.pyc differ
 
__pycache__/table_row_extraction.cpython-310.pyc CHANGED
Binary files a/__pycache__/table_row_extraction.cpython-310.pyc and b/__pycache__/table_row_extraction.cpython-310.pyc differ
 
__pycache__/topic_extraction.cpython-310.pyc CHANGED
Binary files a/__pycache__/topic_extraction.cpython-310.pyc and b/__pycache__/topic_extraction.cpython-310.pyc differ
 
__pycache__/worker.cpython-310.pyc CHANGED
Binary files a/__pycache__/worker.cpython-310.pyc and b/__pycache__/worker.cpython-310.pyc differ
 
page_range.py ADDED
@@ -0,0 +1,258 @@
1
+ #!/usr/bin/env python3
2
+ import os
3
+ import re
4
+ import json
5
+ import logging
6
+ import fitz
7
+ import requests
8
+ from statistics import mode, median
9
+
10
+ from google import genai
11
+ from google.genai import types
12
+
13
+ logging.basicConfig(level=logging.INFO)
14
+ logger = logging.getLogger(__name__)
15
+
16
+ def find_all_occurrences(pdf_bytes: bytes, search_text: str) -> list:
17
+ doc = fitz.open(stream=pdf_bytes, filetype="pdf")
18
+ st_norm = re.sub(r"\s+", " ", search_text).strip()
19
+ found = []
20
+ for i in range(doc.page_count):
21
+ raw = doc[i].get_text("raw")
22
+ norm = re.sub(r"\s+", " ", raw).strip()
23
+ if st_norm in norm:
24
+ found.append(i)
25
+ doc.close()
26
+ return sorted(found)
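
A note on the matcher above: both the search text and each page's raw text are collapsed to single-spaced strings, so a TOC title still matches a heading that wraps across lines on the page. A minimal standalone sketch of that normalisation (the title string is made up):

import re

def norm(s: str) -> str:
    # Collapse every run of whitespace to a single space, as find_all_occurrences does.
    return re.sub(r"\s+", " ", s).strip()

page_text = "Paper 1 and\n    Paper 2: Pure Mathematics"
assert norm("Paper 1 and Paper 2") in norm(page_text)
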
27
+
28
+ class GeminiTopicExtractor:
29
+ def __init__(self, api_key: str = None, num_pages: int = 20):
30
+ self.api_key = api_key or os.getenv("GEMINI_API_KEY", "")
31
+ self.num_pages = num_pages
32
+
33
+ def _read_first_pages_raw(self, pdf_path: str, num_pages: int) -> str:
34
+ text_parts = []
35
+ try:
36
+ if pdf_path.startswith("http://") or pdf_path.startswith("https://"):
37
+ response = requests.get(pdf_path)
38
+ if response.status_code != 200:
39
+ logger.error("Failed to download PDF from %s. Status code: %d", pdf_path, response.status_code)
40
+ return ""
41
+ pdf_bytes = response.content
42
+ else:
43
+ with open(pdf_path, "rb") as f:
44
+ pdf_bytes = f.read()
45
+ doc = fitz.open(stream=pdf_bytes, filetype="pdf")
46
+ pages_to_read = min(num_pages, doc.page_count)
47
+ for i in range(pages_to_read):
48
+ raw_text = doc[i].get_text("raw")
49
+ text_parts.append(raw_text)
50
+ doc.close()
51
+ except Exception as e:
52
+ logger.error(f"Could not open PDF: {e}")
53
+ return "\n".join(text_parts)
54
+
55
+ def extract_subtopics(self, pdf_path: str) -> dict:
56
+ first_pages_text = self._read_first_pages_raw(pdf_path, self.num_pages)
57
+ if not first_pages_text.strip():
58
+ logger.error("No text from first pages => cannot extract subtopics.")
59
+ return {}
60
+ prompt = f"""
61
+ You have the first pages of a PDF specification, including a table of contents.
62
+ Instructions:
63
+ 1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
64
+ 2. Identify the major academic subtopics (common desired topic names "Paper X", "Theme X", "Content of X", "AS Unit X", "A2 Unit X", or similar headings).
65
+ 3. For each subtopic, give the range of pages [start_page, end_page] (1-based) from the table of contents.
66
+ 4. Output only valid JSON of the form:
67
+ {{
68
+ "Subtopic A": [start_page, end_page],
69
+ "Subtopic B": [start_page, end_page]
70
+ }}
71
+ 5. If you can't find any subtopics, return an empty JSON.
72
+ Important notes:
73
+ - The correct "end_page" must be the page number of the next topic or subtopic minus 1.
74
+ - The final output must be valid JSON only, with no extra text or code blocks.
75
+ Examples:
76
+ 1. Given this table of contents:
77
+ 1 Introduction – 2
78
+ Why choose Edexcel A Level Mathematics? - 2
79
+ Supporting you in planning and implementing this qualification - 3
80
+ Qualification at a glance - 5
81
+ 2 Subject content and assessment information – 7
82
+ Paper 1 and Paper 2: Pure Mathematics - 11
83
+ Paper 3: Statistics and Mechanics - 30
84
+ Assessment Objectives - 40
85
+ 3 Administration and general information – 42
86
+ Entries - 42
87
+ Access arrangements, reasonable adjustments, special consideration and malpractice - 42
88
+ Student recruitment and progression - 45
89
+
90
+ The correct output should be:
91
+ {{
92
+ "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
93
+ "Paper 3: Statistics and Mechanics": [30, 38]
94
+ }}
95
+ 2. Given this table of contents:
96
+ Qualification at a glance – 1
97
+ Assessment Objectives and weightings - 4
98
+ Knowledge, skills and understanding – 5
99
+ Theme 1: Introduction to markets and market failure - 5
100
+ Theme 2: The UK economy – performance and policies - 11
101
+ Theme 3: Business behaviour and the labour market - 21
102
+ Theme 4: A global perspective - 29
103
+ Assessment – 39
104
+ Assessment summary - 39
105
+ Assessment objectives - 41
106
+ Assessment overview - 42
107
+
108
+ The correct output should be:
109
+ {{
110
+ "Theme 1: Introduction to markets and market failure": [5, 10],
111
+ "Theme 2: The UK economy – performance and policies": [11, 20],
112
+ "Theme 3: Business behaviour and the labour market": [21, 28],
113
+ "Theme 4: A global perspective": [29, 38]
114
+ }}
115
+ Now, extract topics from this text:
116
+ {first_pages_text}
117
+ """
118
+ global _GEMINI_CLIENT
119
+ if '_GEMINI_CLIENT' not in globals() or _GEMINI_CLIENT is None:
120
+ _GEMINI_CLIENT = genai.Client(api_key=self.api_key)
121
+ client = _GEMINI_CLIENT
122
+ try:
123
+ response = client.models.generate_content(
124
+ model="gemini-2.0-flash",
125
+ contents=[prompt],
126
+ config=types.GenerateContentConfig(temperature=0.0)
127
+ )
128
+ if not response or not response.text:
129
+ logger.warning("No text from LLM => returning empty subtopics.")
130
+ return {}
131
+ raw_json = response.text.strip()
132
+ cleaned = raw_json.replace("```json", "").replace("```", "")
133
+ try:
134
+ data = json.loads(cleaned)
135
+ except Exception as json_err:
136
+ logger.error(f"JSON parsing error: {json_err}")
137
+ return {}
138
+ final_dict = {}
139
+ found_sub_dict = None
140
+ for k, v in data.items():
141
+ if isinstance(v, dict):
142
+ found_sub_dict = v
143
+ break
144
+ if found_sub_dict is not None:
145
+ for subk, rng in found_sub_dict.items():
146
+ if isinstance(rng, list) and len(rng) == 2:
147
+ final_dict[subk] = rng
148
+ else:
149
+ for subk, rng in data.items():
150
+ if isinstance(rng, list) and len(rng) == 2:
151
+ final_dict[subk] = rng
152
+ return final_dict
153
+ except Exception as e:
154
+ logger.error(f"Gemini subtopic extraction error: {e}")
155
+ return {}
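
The unwrapping logic above accepts either a flat mapping or one nested a single level deep; a small sketch of the two shapes it tolerates (topic names, the wrapper key, and page numbers are illustrative):

flat = {"Theme 1": [5, 10], "Theme 2": [11, 20]}
nested = {"topics": {"Theme 1": [5, 10], "Theme 2": [11, 20]}}
# Both reduce to {"Theme 1": [5, 10], "Theme 2": [11, 20]};
# any entry whose value is not a two-element list is dropped.
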
156
+
157
+ class TopicRangeExtractor:
158
+ def __init__(self, gemini_api_key: str):
159
+ self.gemini_api_key = gemini_api_key
160
+ self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=20)
161
+
162
+ def process(self, pdf_path: str) -> dict:
163
+ logger.info(f"Processing PDF: {pdf_path}")
164
+ subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
165
+ logger.info(f"Gemini returned subtopics: {subtopics}")
166
+
167
+ if pdf_path.startswith("http://") or pdf_path.startswith("https://"):
168
+ response = requests.get(pdf_path)
169
+ if response.status_code != 200:
170
+ logger.error("Failed to download PDF from %s. Status code: %d", pdf_path, response.status_code)
171
+ raise Exception(f"Failed to download PDF: {pdf_path}")
172
+ pdf_bytes = response.content
173
+ logger.info("Downloaded %d bytes for pdf_url='%s'", len(pdf_bytes), pdf_path)
174
+ else:
175
+ with open(pdf_path, "rb") as f:
176
+ pdf_bytes = f.read()
177
+ logger.info("Loaded %d bytes from local file '%s'", len(pdf_bytes), pdf_path)
178
+
179
+ doc = fitz.open(stream=pdf_bytes, filetype="pdf")
180
+ total_pages = doc.page_count
181
+ doc.close()
182
+
183
+ # Compute global offset and adjust subtopic ranges.
184
+ if not subtopics:
185
+ global_offset = 0
186
+ subtopics_corrected = {}
187
+ else:
188
+ offset_candidates = []
189
+ subtopics_corrected = {}
190
+ for subname, rng in subtopics.items():
191
+ if not (isinstance(rng, list) and len(rng) == 2):
192
+ continue
193
+ start_p, end_p = rng
194
+ occs = find_all_occurrences(pdf_bytes, subname)
195
+ for p in occs:
196
+ candidate = p - (start_p - 1)
197
+ if candidate > 0:
198
+ offset_candidates.append(candidate)
199
+ subtopics_corrected[subname] = rng
200
+
201
+ if offset_candidates:
202
+ try:
203
+ global_offset = mode(offset_candidates)
204
+ except Exception:
205
+ global_offset = int(median(offset_candidates))
206
+ else:
207
+ global_offset = 0
208
+ logger.info(f"Computed global offset: {global_offset}")
209
+
210
+ # Adjust ranges by applying the global offset.
211
+ adjusted_topics = {}
212
+ for subname, rng in subtopics_corrected.items():
213
+ start_p, end_p = rng
214
+ s0 = (start_p - 1) + global_offset
215
+ e0 = (end_p - 1) + global_offset
216
+ adjusted_topics[subname] = [s0, e0]
217
+
218
+ # Sort the topics by their adjusted start page.
219
+ sorted_topics = sorted(adjusted_topics.items(), key=lambda item: item[1][0])
220
+ effective_ranges = {}
221
+ # For each subtopic, if there is a next one, set its effective end to the next topic's start minus 1.
222
+ for i, (name, (start, end)) in enumerate(sorted_topics):
223
+ if i < len(sorted_topics) - 1:
224
+ next_start = sorted_topics[i+1][1][0]
225
+ effective_end = min(end, next_start - 1)
226
+ else:
227
+ effective_end = end
228
+ effective_ranges[name] = [start, effective_end]
229
+
230
+ # Build the union of pages from each effective range.
231
+ # For every topic except the last, use a half-open range to skip the boundary page.
232
+ real_pages_set = set()
233
+ for i, (name, (start, end)) in enumerate(effective_ranges.items()):
234
+ if i < len(effective_ranges) - 1:
235
+ # End is exclusive so the boundary page (end) is skipped.
236
+ for pp in range(start, end):
237
+ if 0 <= pp < total_pages:
238
+ real_pages_set.add(pp)
239
+ else:
240
+ # For the last topic include the end page.
241
+ for pp in range(start, end + 1):
242
+ if 0 <= pp < total_pages:
243
+ real_pages_set.add(pp)
244
+ page_range = sorted(real_pages_set)
245
+
246
+ return {
247
+ "page_range": page_range
248
+ }
249
+
250
+ if __name__ == "__main__":
251
+ input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
252
+ gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
253
+ try:
254
+ extractor = TopicRangeExtractor(gemini_api_key=gemini_key)
255
+ result = extractor.process(input_pdf)
256
+ print(json.dumps(result, indent=2))
257
+ except Exception as e:
258
+ logger.error(f"Processing failed: {e}")
topic_extr.py CHANGED
@@ -1,57 +1,22 @@
1
  #!/usr/bin/env python3
2
  import os
3
- import re
4
- import gc
5
  import json
6
  import logging
 
7
  import fitz
8
- import boto3
9
- import base64
10
- import time
11
- import asyncio
12
- import tempfile
13
  import requests
14
- from io import BytesIO
15
- from typing import List, Dict, Any
16
-
17
  import torch
18
- import cv2
19
- import numpy as np
20
-
21
- from google import genai
22
- from google.genai import types
23
 
24
  from magic_pdf.data.dataset import PymuDocDataset
25
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
26
- from magic_pdf.data.data_reader_writer.base import DataWriter
27
- from table_row_extraction import TableExtractor
28
 
29
  logging.basicConfig(level=logging.INFO)
30
  logger = logging.getLogger(__name__)
31
- logger.setLevel(logging.INFO)
32
- file_handler = logging.FileHandler("topic_extraction.log")
33
- file_handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(name)s - %(message)s"))
34
- logger.addHandler(file_handler)
35
-
36
- _GEMINI_CLIENT = None
37
 
38
- # helper functions, also global
39
- def unify_whitespace(text: str) -> str:
40
- return re.sub(r"\s+", " ", text).strip()
41
-
42
- def find_all_occurrences(pdf_bytes: bytes, search_text: str) -> List[int]:
43
- doc = fitz.open(stream=pdf_bytes, filetype="pdf")
44
- st_norm = unify_whitespace(search_text)
45
- found = []
46
- for i in range(doc.page_count):
47
- raw = doc[i].get_text("raw")
48
- norm = unify_whitespace(raw)
49
- if st_norm in norm:
50
- found.append(i)
51
- doc.close()
52
- return sorted(found)
53
-
54
- def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> bytes:
55
  if not page_indices:
56
  raise ValueError("No page indices provided for subset creation.")
57
  doc = fitz.open(stream=original_pdf_bytes, filetype="pdf")
@@ -67,121 +32,33 @@ def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> byt
67
  doc.close()
68
  return subset_bytes
69
 
70
- def unify_topic_name(raw_title: str, children_subtopics: list) -> str:
71
- """
72
- Clean up a topic title:
73
- - Remove any trailing "continued".
74
- - If the title does not start with a number but children provide a consistent numeric prefix,
75
- then prepend that prefix.
76
- """
77
- title = raw_title.strip()
78
- # Remove trailing "continued"
79
- title = re.sub(r"\s+continued\s*$", "", title, flags=re.IGNORECASE)
80
-
81
- # If title already starts with a number, use it as is.
82
- if re.match(r"^\d+", title):
83
- return title
84
-
85
- # Otherwise, try to deduce a numeric prefix from the children.
86
- prefixes = []
87
- for child in children_subtopics:
88
- child_title = child.get("title", "").strip()
89
- m = re.match(r"^(\d+)\.", child_title)
90
- if m:
91
- prefixes.append(m.group(1))
92
- if prefixes:
93
- # If all numeric prefixes in children are the same, use that prefix.
94
- if all(p == prefixes[0] for p in prefixes):
95
- # If title is non-empty, prepend the number; otherwise, use a fallback.
96
- if title:
97
- title = f"{prefixes[0]} {title}"
98
- else:
99
- title = f"{prefixes[0]} Topic"
100
- # Optionally, handle known broken titles explicitly.
101
- if title.lower() in {"gonometry"}:
102
- # For example, if children indicate "5.X", set to "5 Trigonometry"
103
- if prefixes and prefixes[0] == "5":
104
- title = "5 Trigonometry"
105
- return title
106
-
107
- def merge_topics(subtopic_list: list) -> list:
108
  """
109
- Merge topics with an enhanced logic:
110
- 1. Clean up each topic's title using unify_topic_name.
111
- 2. Group topics by the parent's numeric prefix (if available). Topics without a numeric prefix use their title.
112
- 3. Reassign children: for each child whose title (e.g. "2.1") does not match its current parent's numeric prefix,
113
- move it to the parent with the matching prefix if available.
114
- 4. Remove duplicate children by merging contents.
115
- 5. Sort parent topics and each parent's children by their numeric ordering.
 
116
  """
117
- # First, merge topics by parent's numeric prefix.
118
- merged = {}
119
- for topic_obj in subtopic_list:
120
- raw_title = topic_obj.get("title", "")
121
- children = topic_obj.get("children", [])
122
- contents = topic_obj.get("contents", [])
123
- new_title = unify_topic_name(raw_title, children)
124
- # Extract parent's numeric prefix, if present.
125
- m = re.match(r"^(\d+)", new_title)
126
- parent_prefix = m.group(1) if m else None
127
- key = parent_prefix if parent_prefix is not None else new_title
128
-
129
- if key not in merged:
130
- merged[key] = {
131
- "title": new_title,
132
- "contents": list(contents),
133
- "children": list(children),
134
- }
135
  else:
136
- # Merge contents and children; choose the longer title.
137
- if len(new_title) > len(merged[key]["title"]):
138
- merged[key]["title"] = new_title
139
- merged[key]["contents"].extend(contents)
140
- merged[key]["children"].extend(children)
141
-
142
- # Build a lookup of merged topics by their numeric prefix.
143
- parent_lookup = merged # keys are numeric prefixes or the full title for non-numeric ones.
144
-
145
- # Reassign children to the correct parent based on their numeric prefix.
146
- for key, topic in merged.items():
147
- new_children = []
148
- for child in topic["children"]:
149
- child_title = child.get("title", "").strip()
150
- m_child = re.match(r"^(\d+)\.", child_title)
151
- if m_child:
152
- child_prefix = m_child.group(1)
153
- if key != child_prefix and child_prefix in parent_lookup:
154
- # Reassign this child to the proper parent.
155
- parent_lookup[child_prefix]["children"].append(child)
156
- continue
157
- new_children.append(child)
158
- topic["children"] = new_children
159
-
160
- # Remove duplicate children by merging their contents.
161
- for topic in merged.values():
162
- child_map = {}
163
- for child in topic["children"]:
164
- ctitle = child.get("title", "").strip()
165
- if ctitle not in child_map:
166
- child_map[ctitle] = child
167
- else:
168
- child_map[ctitle]["contents"].extend(child.get("contents", []))
169
- child_map[ctitle]["children"].extend(child.get("children", []))
170
- topic["children"] = list(child_map.values())
171
-
172
- # Sort children by full numeric order (e.g. "2.1" < "2.10" < "2.2").
173
- def parse_subtopic_num(subtitle):
174
- digits = re.findall(r"\d+", subtitle)
175
- return tuple(int(d) for d in digits) if digits else (9999,)
176
- topic["children"].sort(key=lambda ch: parse_subtopic_num(ch.get("title", "")))
177
-
178
- # Convert merged topics to a sorted list.
179
- def parse_parent_num(topic):
180
- m = re.match(r"^(\d+)", topic.get("title", ""))
181
- return int(m.group(1)) if m else 9999
182
- final_list = list(merged.values())
183
- final_list.sort(key=lambda topic: parse_parent_num(topic))
184
- return final_list
185
 
186
  class s3Writer:
187
  def __init__(self, ak: str, sk: str, bucket: str, endpoint_url: str):
@@ -195,676 +72,44 @@ class s3Writer:
195
 
196
  def write(self, path: str, data: bytes) -> None:
197
  try:
 
198
  file_obj = BytesIO(data)
199
- self.client.upload_fileobj(
200
- file_obj,
201
- self.bucket,
202
- path
203
- )
204
  logger.info(f"Uploaded to S3: {path}")
205
  except Exception as e:
206
  logger.error(f"Failed to upload to S3: {str(e)}")
207
  raise
208
 
209
- def delete(self, path: str) -> None:
210
- try:
211
- self.client.delete_object(Bucket=self.bucket, Key=path)
212
- except Exception as e:
213
- logger.error(f"Failed to delete from S3: {str(e)}")
214
- raise
215
-
216
- def preprocess_image(image_data: bytes, max_dim: int = 600, quality: int = 60) -> bytes:
217
- arr = np.frombuffer(image_data, np.uint8)
218
- img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
219
- if img is not None:
220
- h, w, _ = img.shape
221
- if max(h, w) > max_dim:
222
- scale = max_dim / float(max(h, w))
223
- new_w = int(w * scale)
224
- new_h = int(h * scale)
225
- img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
226
- encode_params = [int(cv2.IMWRITE_JPEG_QUALITY), quality]
227
- success, enc = cv2.imencode(".jpg", img, encode_params)
228
- if success:
229
- return enc.tobytes()
230
- return image_data
231
-
232
- def call_gemini_for_table_classification(image_data: bytes, api_key: str, max_retries: int = 1) -> str:
233
- """
234
- Existing Gemini call to classify an image as TWO_COLUMN, THREE_COLUMN, or NO_TABLE.
235
- """
236
- for attempt in range(max_retries + 1):
237
- try:
238
- prompt = """You are given an image. Determine if it shows a table that has exactly 2 or 3 columns.
239
- The three-column 'table' image includes such key features:
240
- - Three columns header
241
- - Headers like 'Topics', 'Content', 'Guidelines', 'Amplification', 'Additional guidance notes', 'Area of Study'
242
- - Possibly sections (e.g. 8.4, 9.1)
243
- The two-column 'table' image includes such key features:
244
- - Two columns
245
- - Headers like 'Subject content', 'Additional information'
246
- - Possibly sections (e.g. 2.1, 3.4, G2, G3, )
247
- If the image is a relevant table with 2 columns, respond with 'TWO_COLUMN'.
248
- If the image is a relevant table with 3 columns, respond with 'THREE_COLUMN'.
249
- If the image is non-empty but does not show a table, respond with 'NO_TABLE'.
250
- Return only one of these exact labels.
251
- """
252
- global _GEMINI_CLIENT
253
- if _GEMINI_CLIENT is None:
254
- _GEMINI_CLIENT = genai.Client(api_key=api_key)
255
- client = _GEMINI_CLIENT
256
-
257
- resp = client.models.generate_content(
258
- model="gemini-2.0-flash",
259
- contents=[
260
- {
261
- "parts": [
262
- {"text": prompt},
263
- {
264
- "inline_data": {
265
- "mime_type": "image/jpeg",
266
- "data": base64.b64encode(image_data).decode('utf-8')
267
- }
268
- }
269
- ]
270
- }
271
- ],
272
- config=types.GenerateContentConfig(temperature=0.0)
273
- )
274
- if resp and resp.text:
275
- classification = resp.text.strip().upper()
276
- if "THREE" in classification:
277
- return "THREE_COLUMN"
278
- elif "TWO" in classification:
279
- return "TWO_COLUMN"
280
- elif "EMPTY" in classification:
281
- return "EMPTY_IMAGE"
282
- return "NO_TABLE"
283
- except Exception as e:
284
- logger.error(f"Gemini table classification error: {e}")
285
- if "503" in str(e):
286
- return "NO_TABLE"
287
- if attempt < max_retries:
288
- time.sleep(0.5)
289
- else:
290
- return "NO_TABLE"
291
-
292
- async def classify_image_async(image_data: bytes, api_key: str, max_retries: int = 1) -> str:
293
- loop = asyncio.get_event_loop()
294
- preprocessed = preprocess_image(image_data)
295
- return await loop.run_in_executor(None, call_gemini_for_table_classification, preprocessed, api_key, max_retries)
296
-
297
- def call_gemini_for_subtopic_identification_image(image_data: bytes, api_key: str, max_retries: int = 1) -> dict:
298
- for attempt in range(max_retries + 1):
299
- try:
300
- prompt = """
301
- You are given an image from an educational curriculum specification for Gemini Flash 2. The image may contain:
302
- 1) A main topic heading in the format: "<number> <Topic Name>", for example "2 Algebra and functions continued".
303
- 2) A subtopic heading in the format "<number>.<number>" or "<number>.<number>.<number>", for example "2.5", "2.6", "3.4", "2.1.1", "4.3.3" or "1.2.1".
304
- 3) A label-like title in the left column of a two-column table, for example "G2", "G3", "Scarcity, choice and opportunity cost", or similar text without explicit numeric patterns (2.1, 3.4, etc.).
305
- 4) Possibly no relevant text or only truncated text (e.g. "Topics", "Subject content", "What students need to learn", "Content Amplification Additional guidance notes", etc.).
306
-
307
- Your task is to extract:
308
- - **"title"**: A recognized main topic or heading text.
309
- - **"subtopics"**: Any recognized subtopic numbers (e.g. "2.5", "2.6", "3.4", "G2", "2.1.1", "4.1.1"), as an array of strings.
310
-
311
- Follow these rules:
312
-
313
- (1) **If the cell shows a main topic in the format "<number> <Topic Name>",** for example "2 Algebra and functions continued":
314
- - Remove the word "continued" if present.
315
- - Put that resulting text in "title". (e.g. "2 Algebra and functions")
316
- - "subtopics" should be an empty array, unless smaller subtopic numbers (e.g. "2.5") are also detected in the same text.
317
-
318
- (2) **If the cell shows one or more subtopic numbers** in the format "<number>.<number>", for example "2.5", "2.6", or "3.4":
319
- - Collect those exact strings in the JSON key "subtopics" (an array of strings).
320
- - "title" in this case should be an empty string if you only detect subtopics.
321
- (Example: If text is "2.5 Solve linear inequalities...", then "title" = "", "subtopics" = ["2.5"]).
322
-
323
- (3) **If no main topic or subtopic is detected but the text appears to be a heading**, for example "Specialisation, division of labour and exchange", then:
324
- - Return:
325
- {
326
- "title": "<the heading text>",
327
- "subtopics": []
328
- }
329
-
330
- (4) **If there is no numeric value in the left column** (e.g. "2.1" or "2 <Topic name>" not found) but the left column text appears to be a heading (for instance "Scarcity, choice and opportunity cost"), then:
331
- - Use that left column text as "title".
332
- - "subtopics" remains empty.
333
- Example:
334
- If the left column is "Scarcity, choice and opportunity cost" and the right column has definitions, your output is:
335
- {
336
- "title": "Scarcity, choice and opportunity cost",
337
- "subtopics": []
338
- }
339
-
340
- (5) **If there is no numeric value in the left column** (e.g. "2.1" or "2 <Topic name>" not found) or it appears to be a standalone column with text, treat it as a heading.
341
- - "subtopics" remains empty.
342
- Example:
343
- If there is only one column image that is "Specialisation, devision of labour and exchange" and the right column is not present, your output is:
344
- {
345
- "title": "Specialisation, devision of labour and exchange",
346
- "subtopics": []
347
- }
348
-
349
- (6) **If there is a character + digit pattern** in the left column of a two-column table (for example "G2", "G3", "G4", "C1"), treat that as a topic-like label:
350
- - Put that label text into "title" (e.g. "G2").
351
- - "subtopics" remains empty unless you also see actual subtopic formats like "2.5", "3.4" inside the same cell.
352
-
353
- (7) **Output must be valid JSON** in this exact structure, with no extra text or explanation:
354
- {
355
- "title": "...",
356
- "subtopics": [...]
357
- }
358
-
359
- (8) **If the image is blank or truncated**, defined as:
360
- - Contains no words at all (e.g. a blank white or black image), **OR**
361
- - Contains only snippet words/phrases such as "Topics", "Subject content", "Content Amplification Additional guidance notes", "What students need to learn" (including variations in background color), **OR**
362
- - Contains partial headings with no recognizable numeric or textual headings
363
- - Contains partial UI labels only, such as “Topics” in a gray bar or “What students need to learn” in a blue bar, with no additional meaningful text.
364
- then return:
365
- {
366
- "title": "EMPTY_IMAGE",
367
- "subtopics": []
368
- }
369
-
370
- (9) **If you cannot recognize any text matching the patterns above**, or the text is too partial/truncated to form a valid heading, also return:
371
- {
372
- "title": "EMPTY_IMAGE",
373
- "subtopics": []
374
- }
375
-
376
- **Examples**:
377
-
378
- - If the image text is "2 Algebra and functions continued", return:
379
- {
380
- "title": "2 Algebra and functions",
381
- "subtopics": []
382
- }
383
-
384
- - If the image text is "2.5 Solve linear and quadratic inequalities ...", return:
385
- {
386
- "title": "",
387
- "subtopics": ["2.5"]
388
- }
389
-
390
- - If the image text is "Specialisation, division of labour and exchange" (with no numeric patterns at all), return:
391
- {
392
- "title": "Specialisation, division of labour and exchange",
393
- "subtopics": []
394
- }
395
-
396
- - If the left column says "G2" and the right column has details, but no subtopic numbers, return:
397
- {
398
- "title": "G2",
399
- "subtopics": []
400
- }
401
-
402
- - If the image is blank or shows only partial/truncated snippet words (e.g. "Topics", "Content Amplification Additional guidance notes", "Subject content", "What students need to learn") and nothing else, return:
403
- {
404
- "title": "EMPTY_IMAGE",
405
- "subtopics": []
406
- }
407
- """
408
- global _GEMINI_CLIENT
409
- if _GEMINI_CLIENT is None:
410
- _GEMINI_CLIENT = genai.Client(api_key=api_key)
411
- client = _GEMINI_CLIENT
412
-
413
- resp = client.models.generate_content(
414
- model="gemini-2.0-flash",
415
- contents=[
416
- {
417
- "parts": [
418
- {"text": prompt},
419
- {
420
- "inline_data": {
421
- "mime_type": "image/jpeg",
422
- "data": base64.b64encode(image_data).decode("utf-8")
423
- }
424
- }
425
- ]
426
- }
427
- ],
428
- config=types.GenerateContentConfig(temperature=0.0)
429
- )
430
-
431
- if not resp or not resp.text:
432
- logger.warning("Gemini returned an empty response for subtopic extraction.")
433
- return {"title": "", "subtopics": []}
434
-
435
- raw = resp.text.strip()
436
- # Remove any markdown fences if present
437
- raw = raw.replace("```json", "").replace("```", "").strip()
438
- data = json.loads(raw)
439
-
440
- title = data.get("title", "")
441
- subtopics = data.get("subtopics", [])
442
- if title.upper() == "EMPTY_IMAGE":
443
- return {"title": "EMPTY_IMAGE", "subtopics": []}
444
- if not isinstance(subtopics, list):
445
- subtopics = []
446
- return {"title": title, "subtopics": subtopics}
447
-
448
- except Exception as e:
449
- logger.error(f"Gemini subtopic identification error on attempt {attempt}: {e}")
450
- if attempt < max_retries:
451
- time.sleep(0.5)
452
- else:
453
- return {"title": "", "subtopics": []}
454
-
455
- return {"title": "", "subtopics": []}
456
-
457
- class S3ImageWriter(DataWriter):
458
  def __init__(self, s3_writer: s3Writer, base_path: str, gemini_api_key: str):
459
  self.s3_writer = s3_writer
460
  self.base_path = base_path if base_path.endswith("/") else base_path + "/"
461
  self.gemini_api_key = gemini_api_key
462
  self.descriptions = {}
463
- self._img_count = 0
464
- self.extracted_tables = {}
465
-
466
- self.extracted_subtopics = {}
467
 
468
  def write(self, path: str, data: bytes) -> None:
469
- self._img_count += 1
470
- unique_id = f"img_{self._img_count}.jpg"
471
- s3_key = f"{self.base_path}{unique_id}"
472
- self.s3_writer.write(s3_key, data)
473
  self.descriptions[path] = {
474
  "data": data,
475
- "s3_path": s3_key,
476
- "table_classification": "NO_TABLE",
477
- "final_alt": ""
478
- }
479
-
480
- async def post_process_async(self, key: str, md_content: str) -> str:
481
- logger.info("Classifying images to detect tables.")
482
- tasks = {
483
- p: asyncio.create_task(classify_image_async(info["data"], self.gemini_api_key))
484
- for p, info in self.descriptions.items()
485
  }
486
- results = await asyncio.gather(*tasks.values(), return_exceptions=True)
487
- for p, result in zip(list(self.descriptions.keys()), results):
488
- if isinstance(result, Exception):
489
- logger.error(f"Table classification error for {p}: {result}")
490
- self.descriptions[p]['table_classification'] = "NO_TABLE"
491
- else:
492
- self.descriptions[p]['table_classification'] = result
493
-
494
- # Process each image description.
495
- for p, info in list(self.descriptions.items()):
496
- cls = info['table_classification']
497
- if cls == "TWO_COLUMN":
498
- info['final_alt'] = "HAS TO BE PROCESSED - two column table"
499
- elif cls == "THREE_COLUMN":
500
- info['final_alt'] = "HAS TO BE PROCESSED - three column table"
501
- elif cls == "EMPTY_IMAGE":
502
- md_content = md_content.replace(f"![]({key}{p})", "")
503
- try:
504
- self.s3_writer.delete(info['s3_path'])
505
- except Exception as e:
506
- logger.error(f"Error deleting S3 object {info['s3_path']}: {e}")
507
- del self.descriptions[p]
508
- continue
509
- else:
510
- info['final_alt'] = "NO_TABLE image"
511
- md_content = md_content.replace(f"![]({key}{p})", f"![{info['final_alt']}]({info['s3_path']})")
512
-
513
- md_content = await self._process_table_images_in_markdown(key, md_content)
514
-
515
- # Filter final lines to keep only lines with images.
516
- final_lines = [
517
- line.strip() for line in md_content.split("\n")
518
- if re.match(r"^\!\[.*\]\(.*\)", line.strip())
519
- ]
520
- return "\n".join(final_lines)
521
-
522
- async def _process_table_images_in_markdown(self, key: str, md_content: str) -> str:
523
- pat = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
524
- matches = re.findall(pat, md_content, flags=re.IGNORECASE)
525
- if not matches:
526
- return md_content
527
-
528
- for (col_type, s3_key) in matches:
529
- logger.info(f"Processing table image: {s3_key}, columns={col_type}")
530
- img_data = None
531
- for desc in self.descriptions.values():
532
- if desc.get("s3_path") == s3_key:
533
- img_data = desc.get("data")
534
- break
535
- if img_data is None:
536
- logger.warning(f"No image data found for S3 key {s3_key}. Skipping.")
537
- continue
538
-
539
- # Write temporary file for processing.
540
- with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
541
- temp_file.write(img_data)
542
- temp_path = temp_file.name
543
-
544
- try:
545
- if col_type.lower() == 'two':
546
- extractor = TableExtractor(
547
- skip_header=True,
548
- merge_two_col_rows=True,
549
- enable_subtopic_merge=True,
550
- subtopic_threshold=0.2
551
- )
552
- else:
553
- extractor = TableExtractor(
554
- skip_header=True,
555
- merge_two_col_rows=False,
556
- enable_subtopic_merge=False,
557
- subtopic_threshold=0.2
558
- )
559
- row_boxes = extractor.process_image(temp_path)
560
- out_folder = temp_path + "_rows"
561
- os.makedirs(out_folder, exist_ok=True)
562
- extractor.save_extracted_cells(temp_path, row_boxes, out_folder)
563
-
564
- #Group cells by row using file name pattern
565
- recognized_main_topic = ""
566
- main_topic_image_key = None
567
- recognized_subtopics = []
568
- header_found = False
569
- header_row_index = None
570
-
571
- # Loop through each row of extracted cells
572
- for i, row in enumerate(row_boxes):
573
- row_dir = os.path.join(out_folder, f"row_{i}")
574
- valid_info = None
575
- valid_cell_key = None
576
- for j in range(len(row)):
577
- cell_path = os.path.join(row_dir, f"col_{j}.png")
578
- if not os.path.isfile(cell_path):
579
- alternative_path = os.path.join(row_dir, f"col_{j}.jpg")
580
- if os.path.isfile(alternative_path):
581
- cell_path = alternative_path
582
- else:
583
- logger.warning(f"Cell image not found: {cell_path}")
584
- continue
585
- with open(cell_path, "rb") as cf:
586
- cell_image_data = cf.read()
587
- cell_key = f"{self.base_path}cells/{os.path.basename(s3_key)}_r{i}_c{j}.png"
588
- self.s3_writer.write(cell_key, cell_image_data)
589
- info = call_gemini_for_subtopic_identification_image(cell_image_data, self.gemini_api_key)
590
- if info.get("title", "").upper() == "EMPTY_IMAGE":
591
- try:
592
- self.s3_writer.delete(cell_key)
593
- logger.info(f"Deleted empty cell image from S3: {cell_key}")
594
- except Exception as e:
595
- logger.error(f"Error deleting empty cell image {cell_key}: {e}")
596
- continue
597
- valid_info = info
598
- valid_cell_key = cell_key
599
- break # Use only the first valid cell in this row
600
-
601
- if valid_info is None:
602
- continue
603
-
604
- # First valid row becomes header row.
605
- if not header_found:
606
- header_found = True
607
- header_row_index = i
608
- recognized_main_topic = valid_info.get("title", "")
609
- main_topic_image_key = valid_cell_key
610
- # The row immediately following the header is used for subtopic children.
611
- elif i == header_row_index + 1:
612
- for st in valid_info.get("subtopics", []):
613
- recognized_subtopics.append({
614
- "title": st,
615
- "contents": [{"type": "image", "key": valid_cell_key}],
616
- "children": []
617
- })
618
- else:
619
- # Ignore further rows
620
- continue
621
-
622
- final_json = {
623
- "title": recognized_main_topic,
624
- "contents": [],
625
- "children": recognized_subtopics
626
- }
627
- if main_topic_image_key:
628
- final_json["contents"].append({"type": "image", "key": main_topic_image_key})
629
-
630
- # Save the final JSON.
631
- self.extracted_subtopics[s3_key] = final_json
632
-
633
- # Create a snippet to replace the markdown line.
634
- snippet = ["**Extracted table cells:**"]
635
- if main_topic_image_key:
636
- snippet.append(f"![Header]({main_topic_image_key})")
637
- for child in recognized_subtopics:
638
- for content in child.get("contents", []):
639
- snippet.append(f"![Child]({content.get('key')})")
640
- new_snip = "\n".join(snippet)
641
- old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({s3_key})"
642
- md_content = md_content.replace(old_line, new_snip)
643
-
644
- except Exception as e:
645
- logger.error(f"Error processing table image {s3_key}: {e}")
646
- finally:
647
- os.remove(temp_path)
648
-
649
- return md_content
650
 
651
  def post_process(self, key: str, md_content: str) -> str:
652
- return asyncio.run(self.post_process_async(key, md_content))
653
-
654
- class GeminiTopicExtractor:
655
- def __init__(self, api_key: str = None, num_pages: int = 14):
656
- self.api_key = api_key or os.getenv("GEMINI_API_KEY", "")
657
- self.num_pages = num_pages
658
-
659
- def extract_subtopics(self, pdf_path: str) -> Dict[str, List[int]]:
660
- first_pages_text = self._read_first_pages_raw(pdf_path, self.num_pages)
661
- if not first_pages_text.strip():
662
- logger.error("No text from first pages => cannot extract subtopics.")
663
- return {}
664
- prompt = f"""
665
- You have the first pages of a PDF specification, including a table of contents.
666
- Instructions:
667
- 1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
668
- 2. Identify the major academic subtopics (common desired topic names "Paper X", "Theme X", "Content of X", "AS Unit X", "A2 Unit X", or similar headings).
669
- 3. For each subtopic, give the range of pages [start_page, end_page] (1-based) from the table of contents.
670
- 4. Output only valid JSON of the form:
671
- {{
672
- "Subtopic A": [start_page, end_page],
673
- "Subtopic B": [start_page, end_page]
674
- }}
675
- 5. If you can't find any subtopics, return an empty JSON.
676
- Important notes:
677
- - The correct "end_page" must be the page number of the next topic or subtopic minus 1.
678
- - The final output must be valid JSON only, with no extra text or code blocks.
679
- Examples:
680
- 1. Given this table of contents:
681
- 1 Introduction – 2
682
- Why choose Edexcel A Level Mathematics? - 2
683
- Supporting you in planning and implementing this qualification - 3
684
- Qualification at a glance - 5
685
- 2 Subject content and assessment information – 7
686
- Paper 1 and Paper 2: Pure Mathematics - 11
687
- Paper 3: Statistics and Mechanics - 30
688
- Assessment Objectives - 40
689
- 3 Administration and general information – 42
690
- Entries - 42
691
- Access arrangements, reasonable adjustments, special consideration and malpractice - 42
692
- Student recruitment and progression - 45
693
- Appendix 1: Formulae – 49
694
- Appendix 2: Notation – 53
695
- Appendix 3: Use of calculators – 59
696
- Appendix 4: Assessment Objectives – 60
697
- Appendix 5: The context for the development of this qualification – 62
698
- Appendix 6: Transferable skills – 64
699
- Appendix 7: Level 3 Extended Project qualification – 65
700
- Appendix 8: Codes – 67
701
- The correct output should be:
702
- {{
703
- "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
704
- "Paper 3: Statistics and Mechanics": [30, 42]
705
- }}
706
- 2. Given this table of contents:
707
- Qualification at a glance – 1
708
- Assessment Objectives and weightings - 4
709
- Knowledge, skills and understanding – 5
710
- Theme 1: Introduction to markets and market failure - 5
711
- Theme 2: The UK economy – performance and policies - 11
712
- Theme 3: Business behaviour and the labour market - 21
713
- Theme 4: A global perspective - 29
714
- Assessment – 39
715
- Assessment summary - 39
716
- Assessment objectives - 41
717
- Assessment overview - 42
718
- Breakdown of assessment objectives - 42
719
- Synoptic assessment - 43
720
- Discount code and performance tables - 43
721
- Access arrangements, reasonable adjustments and special consideration - 44
722
- Malpractice - 45
723
- Equality Act 2010 and Pearson equality policy - 45
724
- Synoptic assessment - 46
725
- Awarding and reporting - 47
726
- Other information – 49
727
- Student recruitment -49
728
- Prior learning and other requirements -49
729
- Progression - 49
730
- Appendix 1: Transferable skills – 53
731
- Appendix 2: Level 3 Extended Project qualification – 55
732
- Appendix 3: Quantitative skills – 59
733
- Appendix 4: Codes – 61
734
- Appendix 5: Index – 63
735
- The correct output should be:
736
- {{
737
- "Theme 1: Introduction to markets and market failure": [5, 10],
738
- "Theme 2: The UK economy – performance and policies": [11, 20],
739
- "Theme 3: Business behaviour and the labour market": [21, 28],
740
- "Theme 4: A global perspective": [29, 38]
741
- }}
742
- 3. You might also see sections like:
743
- 2.1 AS Unit 1 11
744
- 2.2 AS Unit 2 18
745
- 2.3 A2 Unit 3 24
746
- 2.4 A2 Unit 4 31
747
- In that scenario, your output might look like:
748
- {{
749
- "2.1 AS Unit 1": [11, 17],
750
- "2.2 AS Unit 2": [18, 23],
751
- "2.3 A2 Unit 3": [24, 30],
752
- "2.4 A2 Unit 4": [31, 35]
753
- }}
754
- or
755
- 2.1 AS units 6
756
- 2.2 AS units 23
757
- In that scenario, your output might look like:
758
- {{
759
- "2.1 AS Unit 1": [6, 2],
760
- "2.2 AS Unit 2": [23, 43]
761
- }}
762
-
763
- 4. Another example might list subtopics:
764
- 3.1 Overarching themes 11
765
- 3.2 A: Proof 12
766
- 3.3 B: Algebra and functions 13
767
- 3.4 C: Coordinate geometry in the ( x , y ) plane 14
768
- 3.5 D: Sequences and series 15
769
- 3.6 E: Trigonometry 16
770
- 3.7 F: Exponentials and logarithms 17
771
- 3.8 G: Differentiation 18
772
- 3.9 H: Integration 19
773
- 3.10 I: Numerical methods 20
774
- 3.11 J: Vectors 20
775
- 3.12 K: Statistical sampling 21
776
- 3.13 L: Data presentation and interpretation 21
777
- 3.14 M: Probability 22
778
- 3.15 N: Statistical distributions 23
779
- 3.16 O: Statistical hypothesis testing 23
780
- 3.17 P: Quantities and units in mechanics 24
781
- 3.18 Q: Kinematics 24
782
- 3.19 R: Forces and Newton’s laws 24
783
- 3.20 S: Moments 25
784
- 3.21 Use of data in statistics 26
785
- Here the correct output might look like:
786
- {{
787
- "A: Proof": [12, 12],
788
- "B: Algebra and functions": [13, 13],
789
- ...
790
- }}
791
- Now, extract topics from this text:
792
- {first_pages_text}
793
- """
794
- global _GEMINI_CLIENT
795
- if _GEMINI_CLIENT is None:
796
- _GEMINI_CLIENT = genai.Client(api_key=self.api_key)
797
- client = _GEMINI_CLIENT
798
- try:
799
- response = client.models.generate_content(
800
- model="gemini-2.0-flash",
801
- contents=[prompt],
802
- config=types.GenerateContentConfig(temperature=0.0)
803
- )
804
- if not response or not response.text:
805
- logger.warning("No text from LLM => returning empty subtopics.")
806
- return {}
807
- raw_json = response.text.strip()
808
- cleaned = raw_json.replace("```json", "").replace("```", "")
809
- try:
810
- data = json.loads(cleaned)
811
- except Exception as json_err:
812
- logger.error(f"JSON parsing error: {json_err}")
813
- return {}
814
- final_dict = {}
815
- found_sub_dict = None
816
- for k, v in data.items():
817
- if isinstance(v, dict):
818
- found_sub_dict = v
819
- break
820
- if found_sub_dict is not None:
821
- for subk, rng in found_sub_dict.items():
822
- if isinstance(rng, list) and len(rng) == 2:
823
- final_dict[subk] = rng
824
- else:
825
- for subk, rng in data.items():
826
- if isinstance(rng, list) and len(rng) == 2:
827
- final_dict[subk] = rng
828
- return final_dict
829
- except Exception as e:
830
- logger.error(f"Gemini subtopic extraction error: {e}")
831
- return {}
832
-
833
- def _read_first_pages_raw(self, pdf_path: str, num_pages: int) -> str:
834
- text_parts = []
835
- try:
836
- if pdf_path.startswith("http://") or pdf_path.startswith("https://"):
837
- response = requests.get(pdf_path)
838
- if response.status_code != 200:
839
- logger.error("Failed to download PDF from %s. Status code: %d", pdf_path, response.status_code)
840
- return ""
841
- pdf_bytes = response.content
842
- else:
843
- with open(pdf_path, "rb") as f:
844
- pdf_bytes = f.read()
845
- doc = fitz.open(stream=pdf_bytes, filetype="pdf")
846
- pages_to_read = min(num_pages, doc.page_count)
847
- for i in range(pages_to_read):
848
- raw_text = doc[i].get_text("raw")
849
- text_parts.append(raw_text)
850
- doc.close()
851
- except Exception as e:
852
- logger.error(f"Could not open PDF: {e}")
853
- return "\n".join(text_parts)
854
 
855
- class MineruNoTextProcessor:
856
- def __init__(self, output_folder: str, gemini_api_key: str):
 
857
  self.output_folder = output_folder
858
  os.makedirs(self.output_folder, exist_ok=True)
859
  self.layout_model = "doclayout_yolo"
860
  self.formula_enable = True
861
  self.table_enable = False
862
  self.language = "en"
863
-
864
- self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=20)
865
- self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY", "")
866
-
867
- self.use_s3 = True
868
  self.s3_writer = s3Writer(
869
  ak=os.getenv("S3_ACCESS_KEY"),
870
  sk=os.getenv("S3_SECRET_KEY"),
@@ -880,110 +125,106 @@ class MineruNoTextProcessor:
880
  except Exception as e:
881
  logger.error(f"Error during GPU cleanup: {e}")
882
 
883
- def process(self, pdf_path: str) -> Dict[str, Any]:
884
- logger.info(f"Processing PDF: {pdf_path}")
885
- try:
886
- subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
887
- logger.info(f"Gemini returned subtopics: {subtopics}")
888
-
889
- if pdf_path.startswith("http://") or pdf_path.startswith("https://"):
890
- response = requests.get(pdf_path)
891
- if response.status_code != 200:
892
- logger.error("Failed to download PDF from %s. Status code: %d", pdf_path, response.status_code)
893
- raise Exception(f"Failed to download PDF: {pdf_path}")
894
- pdf_bytes = response.content
895
- logger.info("Downloaded %d bytes for pdf_url='%s'", len(pdf_bytes), pdf_path)
896
- else:
897
- with open(pdf_path, "rb") as f:
898
- pdf_bytes = f.read()
899
- logger.info("Loaded %d bytes from local file '%s'", len(pdf_bytes), pdf_path)
900
-
901
- doc = fitz.open(stream=pdf_bytes, filetype="pdf")
902
- total_pages = doc.page_count
903
- doc.close()
904
-
905
- # Decide which pages to process
906
- final_pages = set()
907
- if not subtopics:
908
- # fallback
909
- final_pages = set(range(total_pages))
910
- else:
911
- offset_candidates = []
912
- for subname, rng in subtopics.items():
913
- start_p, _ = rng
914
- occs = find_all_occurrences(pdf_bytes, subname)
915
- for p in occs:
916
- candidate = p - (start_p - 1)
917
- if candidate > 0:
918
- offset_candidates.append(candidate)
919
- if offset_candidates:
920
- try:
921
- from statistics import mode
922
- global_offset = mode(offset_candidates)
923
- except:
924
- from statistics import median
925
- global_offset = int(median(offset_candidates))
926
- else:
927
- global_offset = 0
928
-
929
- logger.info(f"Computed global offset: {global_offset}")
930
- for subname, rng in subtopics.items():
931
- if not (isinstance(rng, list) and len(rng) == 2):
932
- continue
933
- start_p, end_p = rng
934
- if start_p > end_p:
935
- continue
936
- s0 = (start_p - 1) + global_offset
937
- e0 = (end_p - 1) + global_offset
938
- for pp in range(s0, e0 + 1):
939
- final_pages.add(pp)
940
-
941
- if not final_pages:
942
- final_pages = set(range(total_pages))
943
-
944
- logger.info(f"Processing pages (0-based): {sorted(final_pages)}")
945
- subset_pdf_bytes = create_subset_pdf(pdf_bytes, sorted(final_pages))
946
-
947
- # 4) Analyze and produce markdown
948
- dataset = PymuDocDataset(subset_pdf_bytes)
949
- inference = doc_analyze(
950
- dataset,
951
- ocr=True,
952
- lang=self.language,
953
- layout_model=self.layout_model,
954
- formula_enable=self.formula_enable,
955
- table_enable=self.table_enable
956
- )
957
- # S3
958
- writer = S3ImageWriter(self.s3_writer, "/topic-extraction", self.gemini_api_key)
959
-
960
- md_prefix = "/topic-extraction/"
961
- pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
962
- md_content = pipe_result.get_markdown(md_prefix)
963
- final_markdown = writer.post_process(md_prefix, md_content)
964
-
965
- subtopic_list = list(writer.extracted_subtopics.values())
966
- subtopic_list = merge_topics(subtopic_list)
967
-
968
- out_path = os.path.join(self.output_folder, "_subtopics.json")
969
- with open(out_path, "w", encoding="utf-8") as f:
970
- json.dump(subtopic_list, f, indent=2)
971
- logger.info(f"Final subtopics JSON saved locally at {out_path}")
972
 
973
- return {
974
- "final_markdown": final_markdown,
975
- "subtopics_extracted": subtopic_list
976
- }
977
- finally:
978
- self.cleanup_gpu()
979
 
980
  if __name__ == "__main__":
981
- input_pdf = "/home/user/app/input_output/wjec-gce-as-a-economics-specification-from-2015.pdf"
982
- output_dir = "/home/user/app/pearson_json"
983
- gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
984
- try:
985
- processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=gemini_key)
986
- result = processor.process(input_pdf)
987
- logger.info("Processing completed successfully.")
988
- except Exception as e:
989
- logger.error(f"Processing failed: {e}")
 
1
  #!/usr/bin/env python3
2
  import os
3
+ import sys
 
4
  import json
5
  import logging
6
+ import gc
7
  import fitz
8
  import requests
9
  import torch
10
+ import boto3
11
+ import re
12
 
13
  from magic_pdf.data.dataset import PymuDocDataset
14
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 
 
15
 
16
  logging.basicConfig(level=logging.INFO)
17
  logger = logging.getLogger(__name__)
18
 
19
+ def create_subset_pdf(original_pdf_bytes: bytes, page_indices: list) -> bytes:
20
  if not page_indices:
21
  raise ValueError("No page indices provided for subset creation.")
22
  doc = fitz.open(stream=original_pdf_bytes, filetype="pdf")
 
32
  doc.close()
33
  return subset_bytes
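
The body of create_subset_pdf is unchanged and largely collapsed above; for reference, a minimal PyMuPDF sketch that satisfies the same contract (an assumption for illustration, not the committed implementation):

import fitz  # PyMuPDF

def subset_pdf_sketch(pdf_bytes: bytes, page_indices: list) -> bytes:
    src = fitz.open(stream=pdf_bytes, filetype="pdf")
    out = fitz.open()  # new empty document
    for idx in page_indices:
        out.insert_pdf(src, from_page=idx, to_page=idx)  # copy one page per index
    data = out.tobytes()
    out.close()
    src.close()
    return data
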
34
 
35
+ def parse_page_range(page_field) -> list:
36
  """
37
+ Parse the 'page' field from the JSON input.
38
+ It can be either:
39
+ a list of integers:
40
+ - If the list contains exactly two integers, treat them as a 1-indexed range [start, end], inclusive at both ends.
41
+ - Otherwise, treat the list as a sequence of individual pages.
42
+ a string:
43
+ - Either a comma-separated range "start, end" or a comma-separated list of pages.
44
+ The numbers are assumed to be 1-indexed and are converted to 0-indexed.
45
  """
46
+ if isinstance(page_field, list):
47
+ if len(page_field) == 2:
48
+ start, end = page_field
49
+ return list(range(start - 1, end))
50
  else:
51
+ return [int(p) - 1 for p in page_field]
52
+ elif isinstance(page_field, str):
53
+ parts = [p.strip() for p in page_field.split(',')]
54
+ if len(parts) == 2:
55
+ start, end = int(parts[0]), int(parts[1])
56
+ return list(range(start - 1, end))
57
+ else:
58
+ return [int(p) - 1 for p in parts]
59
+ else:
60
+ logger.error("Invalid type for page field. Must be list or string.")
61
+ raise ValueError("Invalid page field type.")
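
How the two accepted shapes come out of parse_page_range (calls shown with their results; values illustrative):

parse_page_range([15, 20])      # -> [14, 15, 16, 17, 18, 19]  (1-indexed range 15..20)
parse_page_range([15, 16, 20])  # -> [14, 15, 19]              (individual pages)
parse_page_range("15, 20")      # -> [14, 15, 16, 17, 18, 19]
parse_page_range("15, 16, 20")  # -> [14, 15, 19]
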
62
 
63
  class s3Writer:
64
  def __init__(self, ak: str, sk: str, bucket: str, endpoint_url: str):
 
72
 
73
  def write(self, path: str, data: bytes) -> None:
74
  try:
75
+ from io import BytesIO
76
  file_obj = BytesIO(data)
77
+ self.client.upload_fileobj(file_obj, self.bucket, path)
78
  logger.info(f"Uploaded to S3: {path}")
79
  except Exception as e:
80
  logger.error(f"Failed to upload to S3: {str(e)}")
81
  raise
82
 
83
+ class S3ImageWriter:
84
  def __init__(self, s3_writer: s3Writer, base_path: str, gemini_api_key: str):
85
  self.s3_writer = s3_writer
86
  self.base_path = base_path if base_path.endswith("/") else base_path + "/"
87
  self.gemini_api_key = gemini_api_key
88
  self.descriptions = {}
89
 
90
  def write(self, path: str, data: bytes) -> None:
91
+ full_path = f"{self.base_path}{os.path.basename(path)}"
92
+ self.s3_writer.write(full_path, data)
 
 
93
  self.descriptions[path] = {
94
  "data": data,
95
+ "s3_path": full_path
96
  }
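
write() now keys every upload by basename under the writer's prefix; a sketch with hypothetical names:

import os
base_path = "/topic-extraction/"
path = "images/3_0.jpg"
full_path = f"{base_path}{os.path.basename(path)}"  # "/topic-extraction/3_0.jpg"
# Design note: unlike the removed counter-based writer, two images that share a
# basename would now overwrite each other in S3.
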
97
 
98
  def post_process(self, key: str, md_content: str) -> str:
99
+ for path, info in self.descriptions.items():
100
+ s3_path = info.get("s3_path")
101
+ md_content = md_content.replace(f"![]({key}{path})", f"![]({s3_path})")
102
+ return md_content
103
 
104
+ class TopicExtractionProcessor:
105
+ def __init__(self, gemini_api_key: str, s3_config: dict, output_folder: str):
106
+ self.gemini_api_key = gemini_api_key
107
  self.output_folder = output_folder
108
  os.makedirs(self.output_folder, exist_ok=True)
109
  self.layout_model = "doclayout_yolo"
110
  self.formula_enable = True
111
  self.table_enable = False
112
  self.language = "en"
113
  self.s3_writer = s3Writer(
114
  ak=os.getenv("S3_ACCESS_KEY"),
115
  sk=os.getenv("S3_SECRET_KEY"),
 
125
  except Exception as e:
126
  logger.error(f"Error during GPU cleanup: {e}")
127
 
128
+ def process_input_file(self, input_file: dict) -> str:
129
+ key = input_file.get("key", "")
130
+ url = input_file.get("url", "")
131
+ page_field = input_file.get("page")
132
+ if not url or not page_field:
133
+ logger.error("Input file must contain 'url' and 'page' fields.")
134
+ raise ValueError("Missing 'url' or 'page' in input file.")
135
+
136
+ page_indices = parse_page_range(page_field)
137
+ logger.info("Using page indices (0-indexed): %s", page_indices)
138
+
139
+ # Retrieve PDF bytes (supports URL or local file)
140
+ if url.startswith("http://") or url.startswith("https://"):
141
+ response = requests.get(url)
142
+ if response.status_code != 200:
143
+ logger.error("Failed to download PDF from %s. Status code: %d", url, response.status_code)
144
+ raise Exception(f"Failed to download PDF: {url}")
145
+ pdf_bytes = response.content
146
+ else:
147
+ with open(url, "rb") as f:
148
+ pdf_bytes = f.read()
149
+
150
+ subset_pdf_bytes = create_subset_pdf(pdf_bytes, page_indices)
151
+ logger.info("Created subset PDF with %d pages", len(page_indices))
152
+
153
+ dataset = PymuDocDataset(subset_pdf_bytes)
154
+ inference = doc_analyze(
155
+ dataset,
156
+ ocr=True,
157
+ lang=self.language,
158
+ layout_model=self.layout_model,
159
+ formula_enable=self.formula_enable,
160
+ table_enable=self.table_enable
161
+ )
162
+
163
+ base_path = f"/topic-extraction/{key}/"
164
+ writer = S3ImageWriter(self.s3_writer, "/topic-extraction/", self.gemini_api_key)
165
+ md_prefix = "/topic-extraction/"
166
+ pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
167
+ md_content = pipe_result.get_markdown(md_prefix)
168
+ final_markdown = writer.post_process(md_prefix, md_content)
169
+
170
+ output_md_path = os.path.join(self.output_folder, f"{key}_output.md")
171
+ with open(output_md_path, "w", encoding="utf-8") as f:
172
+ f.write(final_markdown)
173
+ logger.info("Markdown output saved to %s", output_md_path)
174
+
175
+ self.cleanup_gpu()
176
+ return final_markdown
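
process_input_file also accepts an http(s) URL for 'url'; a hypothetical invocation (key, URL, and pages made up):

input_file = {
    "key": "econ_spec",
    "url": "https://example.com/specs/economics.pdf",
    "page": "5, 38",  # comma-separated, 1-indexed inclusive range
}
final_md = processor.process_input_file(input_file)
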
177
+
178
+ def main():
179
+ message = {
180
+ "pattern": "topic_extraction",
181
+ "data": {
182
+ "input_files": [
183
+ {
184
+ "key": "sample_spec",
185
+ "url": "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf",
186
+ "type": "specification",
187
+ "page": [
188
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
189
+ 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41
190
+ ]
191
+ }
192
+ ],
193
+ "topics": [
194
+ {
195
+ "title": "Sample Topic",
196
+ "id": 123
197
+ }
198
+ ]
199
+ }
200
+ }
201
+ data = message.get("data", {})
202
+ input_files = data.get("input_files", [])
203
+
204
+ output_folder = "output"
205
+
206
+ gemini_api_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
207
 
208
+ s3_config = {
209
+ "ak": os.getenv("S3_ACCESS_KEY"),
210
+ "sk": os.getenv("S3_SECRET_KEY"),
211
+ "bucket": "quextro-resources",
212
+ "endpoint_url": os.getenv("S3_ENDPOINT")
213
+ }
214
+
215
+ processor = TopicExtractionProcessor(
216
+ gemini_api_key=gemini_api_key,
217
+ s3_config=s3_config,
218
+ output_folder=output_folder
219
+ )
220
+
221
+ for input_file in input_files:
222
+ try:
223
+ logger.info("Processing input file with key: %s", input_file.get("key", ""))
224
+ final_md = processor.process_input_file(input_file)
225
+ logger.info("Processing completed for key: %s", input_file.get("key", ""))
226
+ except Exception as e:
227
+ logger.error("Error processing input file: %s", e)
228
 
229
  if __name__ == "__main__":
230
+ main()